Diffstat (limited to 'kernel')
184 files changed, 30452 insertions, 16999 deletions
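Note on the kernel/async.c hunks below: they drop the hand-rolled async thread pool (async_thread/async_manager_thread) and instead give each pending entry a work_struct that is queued on the shared unbound workqueue. The following is a condensed sketch of that scheduling pattern, assuming the workqueue API used in the patch (INIT_WORK, queue_work, system_unbound_wq); the demo_* names are illustrative and not part of the diff itself.

```c
#include <linux/workqueue.h>
#include <linux/slab.h>

struct demo_entry {
	struct work_struct work;	/* handed to the shared workqueue */
	void (*func)(void *data);
	void *data;
};

static void demo_run(struct work_struct *work)
{
	struct demo_entry *entry = container_of(work, struct demo_entry, work);

	entry->func(entry->data);	/* run the deferred callback */
	kfree(entry);
}

static int demo_schedule(void (*func)(void *), void *data)
{
	struct demo_entry *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);

	if (!entry) {
		func(data);		/* out of memory: fall back to running synchronously */
		return 0;
	}
	INIT_WORK(&entry->work, demo_run);
	entry->func = func;
	entry->data = data;
	queue_work(system_unbound_wq, &entry->work);	/* no private manager/worker threads needed */
	return 0;
}
```

As in the patch, the out-of-memory path simply executes the callback synchronously, so callers never have to handle a scheduling failure.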
diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f..0b72d1a74be 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o + async.o range.o +obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER @@ -67,16 +68,17 @@ obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o -obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o @@ -90,16 +92,18 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index a6605ca921b..fa7eb3de2dd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(file->f_path.dentry, &sbuf)) + if (vfs_statfs(&file->f_path, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -216,7 +216,6 @@ static int acct_on(char *name) { struct file *file; struct vfsmount *mnt; - int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -244,13 +243,6 @@ static int acct_on(char *name) } } - error = security_acct(file); - if (error) { - kfree(acct); - filp_close(file, NULL); - return error; - } - spin_lock(&acct_lock); if (ns->bacct == NULL) { ns->bacct = acct; @@ -281,7 +273,7 @@ static int acct_on(char *name) */ SYSCALL_DEFINE1(acct, const char __user *, name) { - int error; + int error = 0; if (!capable(CAP_SYS_PACCT)) return -EPERM; @@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (acct == NULL) return 0; - error = security_acct(NULL); - if (!error) { - 
spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } + spin_lock(&acct_lock); + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); } + return error; } @@ -353,17 +343,18 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct; + struct bsd_acct_struct *acct = ns->bacct; - spin_lock(&acct_lock); - acct = ns->bacct; - if (acct != NULL) { - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); + if (acct == NULL) + return; - kfree(acct); - } + del_timer_sync(&acct->timer); + spin_lock(&acct_lock); + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); + + kfree(acct); } /* @@ -588,16 +579,6 @@ out: } /** - * acct_init_pacct - initialize a new pacct_struct - * @pacct: per-process accounting info struct to initialize - */ -void acct_init_pacct(struct pacct_struct *pacct) -{ - memset(pacct, 0, sizeof(struct pacct_struct)); - pacct->ac_utime = pacct->ac_stime = cputime_zero; -} - -/** * acct_collect - collect accounting information into pacct_struct * @exitcode: task exit code * @group_dead: not 0, if this thread is the last one in the process. diff --git a/kernel/async.c b/kernel/async.c index 27235f5de19..cd9dbb913c7 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,39 +49,33 @@ asynchronous and synchronous parts of the kernel. */ #include <linux/async.h> -#include <linux/bug.h> #include <linux/module.h> #include <linux/wait.h> #include <linux/sched.h> -#include <linux/init.h> -#include <linux/kthread.h> -#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/workqueue.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; -#define MAX_THREADS 256 #define MAX_WORK 32768 static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); -static int async_enabled = 0; - struct async_entry { - struct list_head list; - async_cookie_t cookie; - async_func_ptr *func; - void *data; - struct list_head *running; + struct list_head list; + struct work_struct work; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); -static DECLARE_WAIT_QUEUE_HEAD(async_new); static atomic_t entry_count; -static atomic_t thread_count; extern int initcall_debug; @@ -116,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) spin_unlock_irqrestore(&async_lock, flags); return ret; } + /* * pick the first pending entry and run it */ -static void run_one_entry(void) +static void async_run_entry_fn(struct work_struct *work) { + struct async_entry *entry = + container_of(work, struct async_entry, work); unsigned long flags; - struct async_entry *entry; ktime_t calltime, delta, rettime; - /* 1) pick one task from the pending queue */ - + /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - if (list_empty(&async_pending)) - goto out; - entry = list_first_entry(&async_pending, struct async_entry, list); - - /* 2) move it to the running queue */ list_move_tail(&entry->list, entry->running); spin_unlock_irqrestore(&async_lock, flags); - /* 3) run it (and print duration)*/ + /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); @@ -152,31 +142,25 @@ static void run_one_entry(void) (long long)ktime_to_ns(delta) >> 10); } - /* 4) remove it from the running queue */ + /* 3) remove 
self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - /* 5) free the entry */ + /* 4) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 6) wake up any waiters. */ + /* 5) wake up any waiters */ wake_up(&async_done); - return; - -out: - spin_unlock_irqrestore(&async_lock, flags); } - static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; - /* allow irq-off callers */ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); @@ -185,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ - if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -195,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l ptr(data, newcookie); return newcookie; } + INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; entry->running = running; @@ -204,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l list_add_tail(&entry->list, &async_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - wake_up(&async_new); + + /* schedule for execution */ + queue_work(system_unbound_wq, &entry->work); + return newcookie; } @@ -311,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); - - -static int async_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int ret = HZ; - set_current_state(TASK_INTERRUPTIBLE); - /* - * check the list head without lock.. false positives - * are dealt with inside run_one_entry() while holding - * the lock. - */ - rmb(); - if (!list_empty(&async_pending)) - run_one_entry(); - else - ret = schedule_timeout(HZ); - - if (ret == 0) { - /* - * we timed out, this means we as thread are redundant. - * we sign off and die, but we to avoid any races there - * is a last-straw check to see if work snuck in. - */ - atomic_dec(&thread_count); - wmb(); /* manager must see our departure first */ - if (list_empty(&async_pending)) - break; - /* - * woops work came in between us timing out and us - * signing off; we need to stay alive and keep working. 
- */ - atomic_inc(&thread_count); - } - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int async_manager_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int tc, ec; - - set_current_state(TASK_INTERRUPTIBLE); - - tc = atomic_read(&thread_count); - rmb(); - ec = atomic_read(&entry_count); - - while (tc < ec && tc < MAX_THREADS) { - if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", - tc))) { - msleep(100); - continue; - } - atomic_inc(&thread_count); - tc++; - } - - schedule(); - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int __init async_init(void) -{ - async_enabled = - !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); - - WARN_ON(!async_enabled); - return 0; -} - -core_initcall(async_init); diff --git a/kernel/audit.c b/kernel/audit.c index 5feed232be9..d96045789b5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -46,6 +46,7 @@ #include <asm/atomic.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> @@ -55,7 +56,6 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> -#include <linux/inotify.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb) skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); audit_log_lost("auditd dissapeared\n"); audit_pid = 0; @@ -406,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/audit.h b/kernel/audit.h index 208687be4f3..f7206db4e13 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); + /* audit watch functions */ -extern unsigned long audit_watch_inode(struct audit_watch *watch); -extern dev_t audit_watch_dev(struct audit_watch *watch); +#ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); -extern int audit_add_watch(struct audit_krule *krule); -extern void audit_remove_watch(struct audit_watch *watch); -extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); -extern void audit_inotify_unregister(struct list_head *in_list); +extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); +extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern struct list_head *audit_watch_rules(struct audit_watch *watch); - -extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +#else +#define audit_put_watch(w) {} +#define audit_get_watch(w) {} +#define audit_to_watch(k, p, l, o) (-EINVAL) +#define audit_add_watch(k, l) (-EINVAL) +#define 
audit_remove_watch_rule(k) BUG() +#define audit_watch_path(w) "" +#define audit_watch_compare(w, i, d) 0 + +#endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 4b05bd9479d..7f18d3a4527 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1,8 +1,9 @@ #include "audit.h" -#include <linux/inotify.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> +#include <linux/slab.h> struct audit_tree; struct audit_chunk; @@ -21,7 +22,7 @@ struct audit_tree { struct audit_chunk { struct list_head hash; - struct inotify_watch watch; + struct fsnotify_mark mark; struct list_head trees; /* with root here */ int dead; int count; @@ -58,7 +59,7 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. @@ -67,7 +68,7 @@ static LIST_HEAD(prune_list); * that makes a difference. Some. */ -static struct inotify_handle *rtree_ih; +static struct fsnotify_group *audit_tree_group; static struct audit_tree *alloc_tree(const char *s) { @@ -110,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree) return tree->pathname; } -static struct audit_chunk *alloc_chunk(int count) -{ - struct audit_chunk *chunk; - size_t size; - int i; - - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); - if (!chunk) - return NULL; - - INIT_LIST_HEAD(&chunk->hash); - INIT_LIST_HEAD(&chunk->trees); - chunk->count = count; - atomic_long_set(&chunk->refs, 1); - for (i = 0; i < count; i++) { - INIT_LIST_HEAD(&chunk->owners[i].list); - chunk->owners[i].index = i; - } - inotify_init_watch(&chunk->watch); - return chunk; -} - static void free_chunk(struct audit_chunk *chunk) { int i; @@ -156,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu) audit_put_chunk(chunk); } +static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +{ + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + call_rcu(&chunk->head, __put_chunk); +} + +static struct audit_chunk *alloc_chunk(int count) +{ + struct audit_chunk *chunk; + size_t size; + int i; + + size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); + chunk = kzalloc(size, GFP_KERNEL); + if (!chunk) + return NULL; + + INIT_LIST_HEAD(&chunk->hash); + INIT_LIST_HEAD(&chunk->trees); + chunk->count = count; + atomic_long_set(&chunk->refs, 1); + for (i = 0; i < count; i++) { + INIT_LIST_HEAD(&chunk->owners[i].list); + chunk->owners[i].index = i; + } + fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + return chunk; +} + enum {HASH_SIZE = 128}; static struct list_head chunk_hash_heads[HASH_SIZE]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); @@ -166,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode) return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock is held by caller */ +/* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - struct list_head *list = chunk_hash(chunk->watch.inode); + struct fsnotify_mark *entry = &chunk->mark; + 
struct list_head *list; + + if (!entry->i.inode) + return; + list = chunk_hash(entry->i.inode); list_add_rcu(&chunk->hash, list); } @@ -180,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (p->watch.inode == inode) { + /* mark.inode may have gone NULL, but who cares? */ + if (p->mark.i.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -209,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p) static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); + struct fsnotify_mark *entry = &chunk->mark; struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; - if (!pin_inotify_watch(&chunk->watch)) { - /* - * Filesystem is shutting down; all watches are getting - * evicted, just take it off the node list for this - * tree and let the eviction logics take care of the - * rest. - */ - owner = p->owner; - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - return; - } + fsnotify_get_mark(entry); spin_unlock(&hash_lock); - /* - * pin_inotify_watch() succeeded, so the watch won't go away - * from under us. - */ - mutex_lock(&chunk->watch.inode->inotify_mutex); - if (chunk->dead) { - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_lock(&entry->lock); + if (chunk->dead || !entry->i.inode) { + spin_unlock(&entry->lock); goto out; } @@ -255,16 +249,17 @@ static void untag_chunk(struct node *p) list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; } new = alloc_chunk(size); if (!new) goto Fallback; - if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { + fsnotify_duplicate_mark(&new->mark, entry); + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); goto Fallback; } @@ -297,9 +292,9 @@ static void untag_chunk(struct node *p) list_for_each_entry(owner, &new->trees, same_root) owner->root = new; spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; Fallback: @@ -313,31 +308,33 @@ Fallback: p->owner = NULL; put_tree(owner); spin_unlock(&hash_lock); - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_unlock(&entry->lock); out: - unpin_inotify_watch(&chunk->watch); + fsnotify_put_mark(entry); spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) { + struct fsnotify_mark *entry; struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) return -ENOMEM; - if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { + entry = &chunk->mark; + if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { free_chunk(chunk); return -ENOSPC; } - mutex_lock(&inode->inotify_mutex); + spin_lock(&entry->lock); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + 
fsnotify_put_mark(entry); return 0; } chunk->owners[0].index = (1U << 31); @@ -350,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) } insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&inode->inotify_mutex); + spin_unlock(&entry->lock); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct inotify_watch *watch; + struct fsnotify_mark *old_entry, *chunk_entry; struct audit_tree *owner; struct audit_chunk *chunk, *old; struct node *p; int n; - if (inotify_find_watch(rtree_ih, inode, &watch) < 0) + old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + if (!old_entry) return create_chunk(inode, tree); - old = container_of(watch, struct audit_chunk, watch); + old = container_of(old_entry, struct audit_chunk, mark); /* are we already there? */ spin_lock(&hash_lock); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return 0; } } @@ -381,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return -ENOMEM; } - mutex_lock(&inode->inotify_mutex); - if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + chunk_entry = &chunk->mark; + + spin_lock(&old_entry->lock); + if (!old_entry->i.inode) { + /* old_entry is being shot, lets just lie */ + spin_unlock(&old_entry->lock); + fsnotify_put_mark(old_entry); free_chunk(chunk); + return -ENOENT; + } + + fsnotify_duplicate_mark(chunk_entry, old_entry); + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + spin_unlock(&old_entry->lock); + free_chunk(chunk); + fsnotify_put_mark(old_entry); return -ENOSPC; } + + /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ + spin_lock(&chunk_entry->lock); spin_lock(&hash_lock); + + /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); - put_inotify_watch(&chunk->watch); + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + + fsnotify_destroy_mark(chunk_entry); + + fsnotify_put_mark(chunk_entry); + fsnotify_put_mark(old_entry); return 0; } list_replace_init(&old->trees, &chunk->trees); @@ -425,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); - inotify_evict_watch(&old->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ - put_inotify_watch(&old->watch); /* and kill it */ + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -548,6 +566,11 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } +static int compare_root(struct vfsmount *mnt, void *arg) +{ + return mnt->mnt_root->d_inode == arg; +} + void audit_trim_trees(void) { struct list_head cursor; @@ -559,7 +582,6 @@ void audit_trim_trees(void) 
struct path path; struct vfsmount *root_mnt; struct node *node; - struct list_head list; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -577,24 +599,18 @@ void audit_trim_trees(void) if (!root_mnt) goto skip_it; - list_add_tail(&list, &root_mnt->mnt_list); spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); - struct inode *inode = chunk->watch.inode; - struct vfsmount *mnt; + /* this could be NULL if the watch is dieing else where... */ + struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; - list_for_each_entry(mnt, &list, mnt_list) { - if (mnt->mnt_root->d_inode == inode) { - node->index &= ~(1U<<31); - break; - } - } + if (iterate_mounts(compare_root, inode, root_mnt)) + node->index &= ~(1U<<31); } spin_unlock(&hash_lock); trim_marked(tree); put_tree(tree); - list_del_init(&list); drop_collected_mounts(root_mnt); skip_it: mutex_lock(&audit_filter_mutex); @@ -603,22 +619,6 @@ skip_it: mutex_unlock(&audit_filter_mutex); } -static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct path *path) -{ - if (mnt != path->mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) - return 0; - if (mnt->mnt_parent == path->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - return is_subdir(dentry, path->dentry); -} - int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) { @@ -638,13 +638,17 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } +static int tag_mount(struct vfsmount *mnt, void *arg) +{ + return tag_chunk(mnt->mnt_root->d_inode, arg); +} + /* called with audit_filter_mutex */ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt, *p; - struct list_head list; + struct vfsmount *mnt; int err; list_for_each_entry(tree, &tree_list, list) { @@ -670,16 +674,9 @@ int audit_add_tree_rule(struct audit_krule *rule) err = -ENOMEM; goto Err; } - list_add_tail(&list, &mnt->mnt_list); get_tree(tree); - list_for_each_entry(p, &list, mnt_list) { - err = tag_chunk(p->mnt_root->d_inode, tree); - if (err) - break; - } - - list_del(&list); + err = iterate_mounts(tag_mount, tree, mnt); drop_collected_mounts(mnt); if (!err) { @@ -714,31 +711,23 @@ int audit_tag_tree(char *old, char *new) { struct list_head cursor, barrier; int failed = 0; - struct path path; + struct path path1, path2; struct vfsmount *tagged; - struct list_head list; - struct vfsmount *mnt; - struct dentry *dentry; int err; - err = kern_path(new, 0, &path); + err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path); - path_put(&path); + tagged = collect_mounts(&path2); + path_put(&path2); if (!tagged) return -ENOMEM; - err = kern_path(old, 0, &path); + err = kern_path(old, 0, &path1); if (err) { drop_collected_mounts(tagged); return err; } - mnt = mntget(path.mnt); - dentry = dget(path.dentry); - path_put(&path); - - list_add_tail(&list, &tagged->mnt_list); mutex_lock(&audit_filter_mutex); list_add(&barrier, &tree_list); @@ -746,7 +735,7 @@ int audit_tag_tree(char *old, char *new) while (cursor.next != &tree_list) { struct audit_tree *tree; - struct vfsmount *p; + int good_one = 0; tree = container_of(cursor.next, struct audit_tree, list); get_tree(tree); @@ -754,30 +743,19 @@ int audit_tag_tree(char *old, char *new) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = kern_path(tree->pathname, 0, &path); - if (err) { - put_tree(tree); - 
mutex_lock(&audit_filter_mutex); - continue; + err = kern_path(tree->pathname, 0, &path2); + if (!err) { + good_one = path_is_under(&path1, &path2); + path_put(&path2); } - spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &path)) { - spin_unlock(&vfsmount_lock); - path_put(&path); + if (!good_one) { put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } - spin_unlock(&vfsmount_lock); - path_put(&path); - - list_for_each_entry(p, &list, mnt_list) { - failed = tag_chunk(p->mnt_root->d_inode, tree); - if (failed) - break; - } + failed = iterate_mounts(tag_mount, tree, tagged); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -818,10 +796,8 @@ int audit_tag_tree(char *old, char *new) } list_del(&barrier); list_del(&cursor); - list_del(&list); mutex_unlock(&audit_filter_mutex); - dput(dentry); - mntput(mnt); + path_put(&path1); drop_collected_mounts(tagged); return failed; } @@ -889,7 +865,6 @@ void audit_kill_trees(struct list_head *list) * Here comes the stuff asynchronous to auditctl operations */ -/* inode->inotify_mutex is locked */ static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; @@ -928,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk) mutex_unlock(&audit_filter_mutex); } -static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static int audit_tree_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmonut_mark, + struct fsnotify_event *event) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); + BUG(); + return -EOPNOTSUPP; +} - if (mask & IN_IGNORED) { - evict_chunk(chunk); - put_inotify_watch(watch); - } +static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) +{ + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + + evict_chunk(chunk); + fsnotify_put_mark(entry); } -static void destroy_watch(struct inotify_watch *watch) +static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - call_rcu(&chunk->head, __put_chunk); + return false; } -static const struct inotify_operations rtree_inotify_ops = { - .handle_event = handle_event, - .destroy_watch = destroy_watch, +static const struct fsnotify_ops audit_tree_ops = { + .handle_event = audit_tree_handle_event, + .should_send_event = audit_tree_send_event, + .free_group_priv = NULL, + .free_event_priv = NULL, + .freeing_mark = audit_tree_freeing_mark, }; static int __init audit_tree_init(void) { int i; - rtree_ih = inotify_init(&rtree_inotify_ops); - if (IS_ERR(rtree_ih)) - audit_panic("cannot initialize inotify handle for rectree watches"); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + if (IS_ERR(audit_tree_group)) + audit_panic("cannot initialize fsnotify group for rectree watches"); for (i = 0; i < HASH_SIZE; i++) INIT_LIST_HEAD(&chunk_hash_heads[i]); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index cc7e87936cb..f0c9b2e7542 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -24,17 +24,18 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/fs.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> -#include 
<linux/inotify.h> +#include <linux/slab.h> #include <linux/security.h> #include "audit.h" /* * Reference counting: * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED * event. Each audit_watch holds a reference to its associated parent. * * audit_watch: if added to lists, lifetime is from audit_init_watch() to @@ -50,40 +51,61 @@ struct audit_watch { unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ + struct list_head rules; /* anchor for krule->rlist */ }; struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ + struct list_head watches; /* anchor for audit_watch->wlist */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ }; -/* Inotify handle. */ -struct inotify_handle *audit_ih; +/* fsnotify handle. */ +struct fsnotify_group *audit_watch_group; -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 +/* fsnotify events we care about. */ +#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF +static void audit_free_parent(struct audit_parent *parent) +{ + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} -static void audit_free_parent(struct inotify_watch *i_watch) +static void audit_watch_free_mark(struct fsnotify_mark *entry) { struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); + parent = container_of(entry, struct audit_parent, mark); + audit_free_parent(parent); +} + +static void audit_get_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_get_mark(&parent->mark); +} + +static void audit_put_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_put_mark(&parent->mark); +} + +/* + * Find and return the audit_parent on the given inode. If found a reference + * is taken on this parent. 
+ */ +static inline struct audit_parent *audit_find_parent(struct inode *inode) +{ + struct audit_parent *parent = NULL; + struct fsnotify_mark *entry; + + entry = fsnotify_find_inode_mark(audit_watch_group, inode); + if (entry) + parent = container_of(entry, struct audit_parent, mark); + + return parent; } void audit_get_watch(struct audit_watch *watch) @@ -104,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch) void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); + audit_put_parent(watch->parent); watch->parent = NULL; audit_put_watch(watch); /* match initial get */ } @@ -114,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -struct list_head *audit_watch_rules(struct audit_watch *watch) -{ - return &watch->rules; -} - -unsigned long audit_watch_inode(struct audit_watch *watch) +int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return watch->ino; -} - -dev_t audit_watch_dev(struct audit_watch *watch) -{ - return watch->dev; + return (watch->ino != (unsigned long)-1) && + (watch->ino == ino) && + (watch->dev == dev); } /* Initialize a parent watch entry. */ static struct audit_parent *audit_init_parent(struct nameidata *ndp) { + struct inode *inode = ndp->path.dentry->d_inode; struct audit_parent *parent; - s32 wd; + int ret; parent = kzalloc(sizeof(*parent), GFP_KERNEL); if (unlikely(!parent)) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); + + fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + parent->mark.mask = AUDIT_FS_WATCH; + ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + if (ret < 0) { + audit_free_parent(parent); + return ERR_PTR(ret); } return parent; @@ -178,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; - if (!audit_ih) + if (!audit_watch_group) return -EOPNOTSUPP; if (path[0] != '/' || path[len-1] == '/' || @@ -216,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) new->dev = old->dev; new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); + audit_get_parent(old->parent); new->parent = old->parent; out: @@ -250,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_entry *oentry, *nentry; mutex_lock(&audit_filter_mutex); + /* Run all of the watches on this parent looking for the one that + * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { if (audit_compare_dname_path(dname, owatch->path, NULL)) continue; /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. 
*/ - if (invalidating && current->audit_context) + if (invalidating && !audit_dummy_context()) audit_filter_inodes(current, current->audit_context); + /* updating ino will likely change which audit_hash_list we + * are on so we need a new watch for the new list */ nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { mutex_unlock(&audit_filter_mutex); @@ -274,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.rlist); list_del_rcu(&oentry->list); - nentry = audit_dupe_rule(&oentry->rule, nwatch); + nentry = audit_dupe_rule(&oentry->rule); if (IS_ERR(nentry)) { list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { int h = audit_hash_ino((u32)ino); + + /* + * nentry->rule.watch == oentry->rule.watch so + * we must drop that reference and set it to our + * new watch. + */ + audit_put_watch(nentry->rule.watch); + audit_get_watch(nwatch); + nentry->rule.watch = nwatch; list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); list_replace(&oentry->rule.list, @@ -311,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_entry *e; mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); @@ -324,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_remove_watch(w); } mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } + fsnotify_destroy_mark(&parent->mark); } /* Get path information necessary for adding watches. */ @@ -388,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) } } -/* Associate the given rule with an existing parent inotify_watch. +/* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, struct audit_parent *parent) @@ -396,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule, struct audit_watch *w, *watch = krule->watch; int watch_found = 0; + BUG_ON(!mutex_is_locked(&audit_filter_mutex)); + list_for_each_entry(w, &parent->watches, wlist) { if (strcmp(watch->path, w->path)) continue; @@ -412,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule, } if (!watch_found) { - get_inotify_watch(&parent->wdata); + audit_get_parent(parent); watch->parent = parent; list_add(&watch->wlist, &parent->watches); @@ -422,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule, /* Find a matching watch entry, or add this one. * Caller must hold audit_filter_mutex. 
*/ -int audit_add_watch(struct audit_krule *krule) +int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; struct audit_parent *parent; struct nameidata *ndp = NULL, *ndw = NULL; - int ret = 0; + int h, ret = 0; mutex_unlock(&audit_filter_mutex); @@ -440,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule) goto error; } + mutex_lock(&audit_filter_mutex); + /* update watch filter fields */ if (ndw) { watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; watch->ino = ndw->path.dentry->d_inode->i_ino; } - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { + /* either find an old parent or attach a new one */ + parent = audit_find_parent(ndp->path.dentry->d_inode); + if (!parent) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); ret = PTR_ERR(parent); goto error; } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); + } - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); + audit_add_to_parent(krule, parent); - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); + /* match get in audit_find_parent or audit_init_parent */ + audit_put_parent(parent); + h = audit_hash_ino((u32)watch->ino); + *list = &audit_inode_hash[h]; error: audit_put_nd(ndp, ndw); /* NULL args OK */ return ret; } -void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +void audit_remove_watch_rule(struct audit_krule *krule) { struct audit_watch *watch = krule->watch; struct audit_parent *parent = watch->parent; @@ -491,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) audit_remove_watch(watch); if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, list); + audit_get_parent(parent); + fsnotify_destroy_mark(&parent->mark); + audit_put_parent(parent); } } } -/* Update watch data in audit rules based on inotify events. */ -static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + return true; +} + +/* Update watch data in audit rules based on fsnotify events. 
*/ +static int audit_watch_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) { + struct inode *inode; + __u32 mask = event->mask; + const char *dname = event->file_name; struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); + parent = container_of(inode_mark, struct audit_parent, mark); - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) + BUG_ON(group != audit_watch_group); + + switch (event->data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = event->path.dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = event->inode; + break; + default: + BUG(); + inode = NULL; + break; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + else if (mask & (FS_DELETE|FS_MOVED_FROM)) audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { + else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); + + return 0; } -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, +static const struct fsnotify_ops audit_watch_fsnotify_ops = { + .should_send_event = audit_watch_should_send_event, + .handle_event = audit_watch_handle_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, }; static int __init audit_watch_init(void) { - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + if (IS_ERR(audit_watch_group)) { + audit_watch_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } return 0; } -subsys_initcall(audit_watch_init); +device_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a70604047f3..eb7675499fb 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/security.h> #include "audit.h" @@ -70,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e) { int i; struct audit_krule *erule = &e->rule; + /* some rules don't have associated watches */ if (erule->watch) audit_put_watch(erule->watch); @@ -745,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. 
*/ -struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -768,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; - new->watch = NULL; new->field_count = old->field_count; + /* * note that we are OK with not refcounting here; audit_match_tree() * never dereferences tree and we can't get false positives there @@ -810,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, } } - if (watch) { - audit_get_watch(watch); - new->watch = watch; + if (old->watch) { + audit_get_watch(old->watch); + new->watch = old->watch; } return entry; @@ -865,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int h, err; + int err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -888,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry) if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule); + err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); goto error; } - /* entry->rule.watch may have changed during audit_add_watch() */ - watch = entry->rule.watch; - h = audit_hash_ino((u32)audit_watch_inode(watch)); - list = &audit_inode_hash[h]; } if (tree) { err = audit_add_tree_rule(&entry->rule); @@ -948,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - LIST_HEAD(inotify_list); int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -968,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry) } if (e->rule.watch) - audit_remove_watch_rule(&e->rule, &inotify_list); + audit_remove_watch_rule(&e->rule); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -986,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - if (!list_empty(&inotify_list)) - audit_inotify_unregister(&inotify_list); - out: if (watch) audit_put_watch(watch); /* match initial get */ @@ -1322,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r) { struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; - struct audit_watch *watch; - struct audit_tree *tree; int err = 0; if (!security_audit_rule_known(r)) return 0; - watch = r->watch; - tree = r->tree; - nentry = audit_dupe_rule(r, watch); + nentry = audit_dupe_rule(r); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); - if (watch) + if (r->watch) list_del(&r->rlist); list_del_rcu(&entry->list); list_del(&r->list); } else { - if (watch) { - list_add(&nentry->rule.rlist, audit_watch_rules(watch)); - list_del(&r->rlist); - } else if (tree) + if (r->watch || r->tree) list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); list_replace(&r->list, &nentry->rule.list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fc0f928167e..1b31c130d03 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -49,6 +49,7 @@ #include <linux/namei.h> #include <linux/mm.h> 
#include <linux/module.h> +#include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> #include <linux/mqueue.h> @@ -64,7 +65,6 @@ #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/inotify.h> #include <linux/capability.h> #include <linux/fs_struct.h> @@ -548,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) - result = (name->dev == audit_watch_dev(rule->watch) && - name->ino == audit_watch_inode(rule->watch)); + if (name) + result = audit_watch_compare(rule->watch, name->ino, name->dev); break; case AUDIT_DIR: if (ctx) @@ -1725,7 +1724,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(list_empty(&inode->inotify_watches))) + if (likely(hlist_empty(&inode->i_fsnotify_marks))) return; context = current->audit_context; p = context->trees; @@ -1768,7 +1767,7 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d->d_inode; - if (inode && unlikely(!list_empty(&inode->inotify_watches))) { + if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { @@ -1836,13 +1835,8 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd.dentry) { - read_lock(¤t->fs->lock); - context->pwd = current->fs->pwd; - path_get(¤t->fs->pwd); - read_unlock(¤t->fs->lock); - } - + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); } /* audit_putname - intercept a putname request @@ -1893,7 +1887,7 @@ static int audit_inc_name_count(struct audit_context *context, { if (context->name_count >= AUDIT_NAMES) { if (inode) - printk(KERN_DEBUG "name_count maxed, losing inode data: " + printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " "dev=%02x:%02x, inode=%lu\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), @@ -1988,7 +1982,6 @@ void __audit_inode(const char *name, const struct dentry *dentry) /** * audit_inode_child - collect inode info for created/removed objects - * @dname: inode's dentry name * @dentry: dentry being audited * @parent: inode of dentry parent * @@ -2000,13 +1993,14 @@ void __audit_inode(const char *name, const struct dentry *dentry) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. 
*/ -void __audit_inode_child(const char *dname, const struct dentry *dentry, +void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; + const char *dname = dentry->d_name.name; int dirlen = 0; if (!context->in_syscall) @@ -2014,9 +2008,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (inode) handle_one(inode); - /* determine matching parent */ - if (!dname) - goto add_names; /* parent is more likely, look for it first */ for (idx = 0; idx < context->name_count; idx++) { diff --git a/kernel/capability.c b/kernel/capability.c index 7f876e60521..2f05303715a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,7 +15,6 @@ #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <asm/uaccess.h> -#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities @@ -135,7 +134,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - read_lock(&tasklist_lock); + rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) @@ -143,7 +142,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, else ret = security_capget(target, pEp, pIp, pPp); - read_unlock(&tasklist_lock); + rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1fbcc748044..c9483d8f614 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4,6 +4,10 @@ * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. @@ -43,6 +47,7 @@ #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> +#include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hash.h> @@ -51,15 +56,21 @@ #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ +#include <linux/eventfd.h> +#include <linux/poll.h> #include <asm/atomic.h> static DEFINE_MUTEX(cgroup_mutex); -/* Generate an array of cgroup subsystem pointers */ +/* + * Generate an array of cgroup subsystem pointers. At boot time, this is + * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * registered after that. The mutable section of this array is protected by + * cgroup_mutex. + */ #define SUBSYS(_x) &_x ## _subsys, - -static struct cgroup_subsys *subsys[] = { +static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; @@ -146,6 +157,35 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; +/* + * cgroup_event represents events which userspace want to recieve. + */ +struct cgroup_event { + /* + * Cgroup which the event belongs to. + */ + struct cgroup *cgrp; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. 
+ */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; /* The list of hierarchy roots */ @@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); */ static int need_forkexit_callback __read_mostly; +#ifdef CONFIG_PROVE_LOCKING +int cgroup_lock_is_held(void) +{ + return lockdep_is_held(&cgroup_mutex); +} +#else /* #ifdef CONFIG_PROVE_LOCKING */ +int cgroup_lock_is_held(void) +{ + return mutex_is_locked(&cgroup_mutex); +} +#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ + +EXPORT_SYMBOL_GPL(cgroup_lock_is_held); + /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) { @@ -235,7 +289,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; -static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); +static int cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *css); /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock @@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set( struct hlist_node *node; struct css_set *cg; - /* Built the set of subsystem state objects that we want to - * see in the new css_set */ + /* + * Build the set of subsystem state objects that we want to see in the + * new css_set. while subsystems can change globally, the entries here + * won't change, so no need for locking. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. So we want @@ -681,6 +739,7 @@ void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_lock); /** * cgroup_unlock - release lock on cgroup changes @@ -691,6 +750,7 @@ void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_unlock); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) if (ret) break; } + return ret; } @@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) css_put(css); } - +/* + * Call with cgroup_mutex held. Drops reference counts on modules, including + * any duplicate ones that parse_cgroupfs_options took. If this function + * returns an error, no reference counts are touched. + */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup *cgrp = &root->top_cgroup; int i; + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ @@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; + /* + * Nobody should tell us to do a subsys that doesn't exist: + * parse_cgroupfs_options should catch that case and refcounts + * ensure that subsystems won't disappear once selected. 
+ */ + BUG_ON(ss == NULL); if (ss->root != &rootnode) { /* Subsystem isn't free */ return -EBUSY; @@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long bit = 1UL << i; if (bit & added_bits) { /* We're binding this subsystem to this hierarchy */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); @@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(ss, cgrp); mutex_unlock(&ss->hierarchy_mutex); + /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); @@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root, subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); mutex_unlock(&ss->hierarchy_mutex); + /* subsystem is now free - drop reference on module */ + module_put(ss->module); } else if (bit & final_bits) { /* Subsystem state should already exist */ + BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); + /* + * a refcount was taken, but we already had one, so + * drop the extra reference. + */ + module_put(ss->module); +#ifdef CONFIG_MODULE_UNLOAD + BUG_ON(ss->module && !module_refcount(ss->module)); +#endif } else { /* Subsystem state shouldn't exist */ BUG_ON(cgrp->subsys[i]); @@ -971,13 +1058,20 @@ struct cgroup_sb_opts { }; -/* Convert a hierarchy specifier into a bitmask of subsystems and - * flags. */ -static int parse_cgroupfs_options(char *data, - struct cgroup_sb_opts *opts) +/* + * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call + * with cgroup_mutex held to protect the subsys[] array. This function takes + * refcounts on subsystems to be used, unless it returns error, in which case + * no refcounts are taken. 
+ */ +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; unsigned long mask = (unsigned long)-1; + int i; + bool module_pin_failed = false; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_subsys_id); @@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data, return -EINVAL; if (!strcmp(token, "all")) { /* Add all non-disabled subsystems */ - int i; opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (!ss->disabled) opts->subsys_bits |= 1ul << i; } @@ -1007,11 +1102,10 @@ static int parse_cgroupfs_options(char *data, if (opts->release_agent) return -EINVAL; opts->release_agent = - kstrndup(token + 14, PATH_MAX, GFP_KERNEL); + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; } else if (!strncmp(token, "name=", 5)) { - int i; const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1029,15 +1123,16 @@ static int parse_cgroupfs_options(char *data, if (opts->name) return -EINVAL; opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN, + MAX_CGROUP_ROOT_NAMELEN - 1, GFP_KERNEL); if (!opts->name) return -ENOMEM; } else { struct cgroup_subsys *ss; - int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; + if (ss == NULL) + continue; if (!strcmp(token, ss->name)) { if (!ss->disabled) set_bit(i, &opts->subsys_bits); @@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data, if (!opts->subsys_bits && !opts->name) return -EINVAL; + /* + * Grab references on all the modules we'll need, so the subsystems + * don't dance around before rebind_subsystems attaches them. This may + * take duplicate reference counts on a subsystem that's already used, + * but rebind_subsystems handles this case. + */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + if (!try_module_get(subsys[i]->module)) { + module_pin_failed = true; + break; + } + } + if (module_pin_failed) { + /* + * oops, one of the modules was going away. this means that we + * raced with a module_delete call, and to the user this is + * essentially a "subsystem doesn't exist" case. 
+ */ + for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + /* drop refcounts only on the ones we took */ + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + module_put(subsys[i]->module); + } + return -ENOENT; + } + return 0; } +static void drop_parsed_module_refcounts(unsigned long subsys_bits) +{ + int i; + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & subsys_bits)) + continue; + module_put(subsys[i]->module); + } +} + static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* Don't allow flags to change at remount */ - if (opts.flags != root->flags) { - ret = -EINVAL; - goto out_unlock; - } - - /* Don't allow name to change at remount */ - if (opts.name && strcmp(opts.name, root->name)) { + /* Don't allow flags or name to change at remount */ + if (opts.flags != root->flags || + (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; } ret = rebind_subsystems(root, opts.subsys_bits); - if (ret) + if (ret) { + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; + } /* (re)populate subsystem files */ cgroup_populate_dir(cgrp); @@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->event_list); + spin_lock_init(&cgrp->event_list_lock); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ + mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); + mutex_unlock(&cgroup_mutex); if (ret) goto out_err; @@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto out_err; + goto drop_modules; } opts.new_root = new_root; @@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); - goto out_err; + goto drop_modules; } root = sb->s_fs_info; @@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); goto drop_new_super; } + /* + * There must be no failure case after here, since rebinding + * takes care of subsystems' refcounts, which are explicitly + * dropped in the failure exit path. 
+ */ /* EBUSY should be the only error here */ BUG_ON(ret); @@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + /* no subsys rebinding, so refcounts don't change */ + drop_parsed_module_refcounts(opts.subsys_bits); } simple_set_mnt(mnt, sb); @@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); + drop_modules: + drop_parsed_module_refcounts(opts.subsys_bits); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -1472,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct kobject *cgroup_kobj; + static inline struct cgroup *__d_cgrp(struct dentry *dentry) { return dentry->d_fsdata; @@ -1495,7 +1648,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry) int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; - struct dentry *dentry = rcu_dereference(cgrp->dentry); + struct dentry *dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { /* @@ -1511,13 +1666,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { int len = dentry->d_name.len; + if ((start -= len) < buf) return -ENAMETOOLONG; - memcpy(start, cgrp->dentry->d_name.name, len); + memcpy(start, dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; - dentry = rcu_dereference(cgrp->dentry); + + dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!cgrp->parent) continue; if (--start < buf) @@ -1527,6 +1686,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) memmove(buf, start, buf + buflen - start); return 0; } +EXPORT_SYMBOL_GPL(cgroup_path); /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' @@ -1539,7 +1699,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; - struct cgroup_subsys *ss; + struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct css_set *cg; struct css_set *newcg; @@ -1553,8 +1713,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk, false); - if (retval) - return retval; + if (retval) { + /* + * Remember on which subsystem the can_attach() + * failed, so that we only call cancel_attach() + * against the subsystems whose can_attach() + * succeeded. (See below) + */ + failed_ss = ss; + goto out; + } } } @@ -1568,14 +1736,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) */ newcg = find_css_set(cg, cgrp); put_css_set(cg); - if (!newcg) - return -ENOMEM; + if (!newcg) { + retval = -ENOMEM; + goto out; + } task_lock(tsk); if (tsk->flags & PF_EXITING) { task_unlock(tsk); put_css_set(newcg); - return -ESRCH; + retval = -ESRCH; + goto out; } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1601,8 +1772,47 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * is no longer empty. */ cgroup_wakeup_rmdir_waiter(cgrp); - return 0; +out: + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) + /* + * This subsystem was the one that failed the + * can_attach() check earlier, so we don't need + * to call cancel_attach() against it or any + * remaining subsystems. 
+ */ + break; + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, tsk, false); + } + } + return retval; +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroupfs_root *root; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; } +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex @@ -1667,6 +1877,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp) } return true; } +EXPORT_SYMBOL_GPL(cgroup_lock_live_group); static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) @@ -1935,6 +2146,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; +/* + * Check if a file is a control file + */ +static inline struct cftype *__file_cft(struct file *file) +{ + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + return ERR_PTR(-EINVAL); + return __d_cft(file->f_dentry); +} + static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { @@ -2054,6 +2275,7 @@ int cgroup_add_file(struct cgroup *cgrp, error = PTR_ERR(dentry); return error; } +EXPORT_SYMBOL_GPL(cgroup_add_file); int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -2068,6 +2290,7 @@ int cgroup_add_files(struct cgroup *cgrp, } return 0; } +EXPORT_SYMBOL_GPL(cgroup_add_files); /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -2453,7 +2676,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + struct pid_namespace *ns = current->nsproxy->pid_ns; + /* * We can't drop the pidlist_mutex before taking the l->mutex in case * the last ref-holder is trying to remove l from the list at the same @@ -2463,8 +2687,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { - /* found a matching list - drop the extra refcount */ - put_pid_ns(ns); /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); @@ -2475,13 +2697,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); - put_pid_ns(ns); return l; } init_rwsem(&l->mutex); down_write(&l->mutex); l->key.type = type; - l->key.ns = ns; + l->key.ns = get_pid_ns(ns); l->use_count = 0; /* don't increment here */ l->list = NULL; l->owner = cgrp; @@ -2789,6 +3010,173 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, } /* + * Unregister event and free resources. + * + * Gets called from workqueue. 
+ */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup *cgrp = event->cgrp; + + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + + eventfd_ctx_put(event->eventfd); + kfree(event); + dput(cgrp->dentry); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->cgrp; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + __remove_wait_queue(event->wqh, &event->wait); + spin_lock(&cgrp->event_list_lock); + list_del(&event->list); + spin_unlock(&cgrp->event_list_lock); + /* + * We are in atomic context, but cgroup_event_remove() may + * sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct cgroup_event *event = NULL; + unsigned int efd, cfd; + struct file *efile = NULL; + struct file *cfile = NULL; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = eventfd_fget(efd); + if (IS_ERR(efile)) { + ret = PTR_ERR(efile); + goto fail; + } + + event->eventfd = eventfd_ctx_fileget(efile); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto fail; + } + + cfile = fget(cfd); + if (!cfile) { + ret = -EBADF; + goto fail; + } + + /* the process need read permission on control file */ + ret = file_permission(cfile, MAY_READ); + if (ret < 0) + goto fail; + + event->cft = __file_cft(cfile); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto fail; + } + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto fail; + } + + ret = event->cft->register_event(cgrp, event->cft, + event->eventfd, buffer); + if (ret) + goto fail; + + if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + ret = 0; + goto fail; + } + + /* + * Events should be removed after rmdir of cgroup directory, but before + * destroying subsystem state objects. Let's take reference to cgroup + * directory dentry to do that. 
+ */ + dget(cgrp->dentry); + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fput(cfile); + fput(efile); + + return 0; + +fail: + if (cfile) + fput(cfile); + + if (event && event->eventfd && !IS_ERR(event->eventfd)) + eventfd_ctx_put(event->eventfd); + + if (!IS_ERR_OR_NULL(efile)) + fput(efile); + + kfree(event); + + return ret; +} + +/* * for the common functions, 'private' gives the type of file */ /* for hysterical raisins, we can't put this on the older files */ @@ -2813,6 +3201,11 @@ static struct cftype files[] = { .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .write_string = cgroup_write_event_control, + .mode = S_IWUGO, + }, }; static struct cftype cft_release_agent = { @@ -2877,8 +3270,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) /* We need to take each hierarchy_mutex in a consistent order */ int i; + /* + * No worry about a race with rebind_subsystems that might mess up the + * locking order, since both parties are under cgroup_mutex. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_lock(&ss->hierarchy_mutex); } @@ -2890,6 +3289,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_unlock(&ss->hierarchy_mutex); } @@ -2936,14 +3337,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); + if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; } init_cgroup_css(css, ss, cgrp); - if (ss->use_id) - if (alloc_css_id(ss, parent, cgrp)) + if (ss->use_id) { + err = alloc_css_id(ss, parent, cgrp); + if (err) goto err_destroy; + } /* At error, ->destroy() callback has to free assigned ID. */ } @@ -3010,11 +3414,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * synchronization other than RCU, and the subsystem linked * list isn't RCU-safe */ int i; + /* + * We won't need to lock the subsys array, because the subsystems + * we're concerned about aren't going anywhere since our cgroup root + * has a reference on them. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; - /* Skip subsystems not in this hierarchy */ - if (ss->root != cgrp->root) + /* Skip subsystems not present or not in this hierarchy */ + if (ss == NULL || ss->root != cgrp->root) continue; css = cgrp->subsys[ss->subsys_id]; /* When called from check_for_release() it's possible @@ -3088,6 +3497,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct dentry *d; struct cgroup *parent; DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; int ret; /* the vfs holds both inode->i_mutex already */ @@ -3171,6 +3581,20 @@ again: set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); + /* + * Unregister events and notify userspace. 
+ * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + remove_wait_queue(event->wqh, &event->wait); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + mutex_unlock(&cgroup_mutex); return 0; } @@ -3205,9 +3629,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_init(&ss->hierarchy_mutex); lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; + + /* this function shouldn't be used with modular subsystems, since they + * need to register a subsys_id, among other things */ + BUG_ON(ss->module); } /** + * cgroup_load_subsys: load and register a modular subsystem at runtime + * @ss: the subsystem to load + * + * This function should be called in a modular subsystem's initcall. If the + * subsystem is built as a module, it will be assigned a new subsys_id and set + * up for use. If the subsystem is built-in anyway, work is delegated to the + * simpler cgroup_init_subsys. + */ +int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) +{ + int i; + struct cgroup_subsys_state *css; + + /* check name and function validity */ + if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || + ss->create == NULL || ss->destroy == NULL) + return -EINVAL; + + /* + * we don't support callbacks in modular subsystems. this check is + * before the ss->module check for consistency; a subsystem that could + * be a module should still have no callbacks even if the user isn't + * compiling it as one. + */ + if (ss->fork || ss->exit) + return -EINVAL; + + /* + * an optionally modular subsystem is built-in: we want to do nothing, + * since cgroup_init_subsys will have already taken care of it. + */ + if (ss->module == NULL) { + /* a few sanity checks */ + BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + BUG_ON(subsys[ss->subsys_id] != ss); + return 0; + } + + /* + * need to register a subsys id before anything else - for example, + * init_cgroup_css needs it. + */ + mutex_lock(&cgroup_mutex); + /* find the first empty slot in the array */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + if (subsys[i] == NULL) + break; + } + if (i == CGROUP_SUBSYS_COUNT) { + /* maximum number of subsystems already registered! */ + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + /* assign ourselves the subsys_id */ + ss->subsys_id = i; + subsys[i] = ss; + + /* + * no ss->create seems to need anything important in the ss struct, so + * this can happen first (i.e. before the rootnode attachment). + */ + css = ss->create(ss, dummytop); + if (IS_ERR(css)) { + /* failure case - need to deassign the subsys[] slot. */ + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return PTR_ERR(css); + } + + list_add(&ss->sibling, &rootnode.subsys_list); + ss->root = &rootnode; + + /* our new subsystem will be attached to the dummy hierarchy. */ + init_cgroup_css(css, ss, dummytop); + /* init_idr must be after init_cgroup_css because it sets css->id. */ + if (ss->use_id) { + int ret = cgroup_init_idr(ss, css); + if (ret) { + dummytop->subsys[ss->subsys_id] = NULL; + ss->destroy(ss, dummytop); + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return ret; + } + } + + /* + * Now we need to entangle the css into the existing css_sets. 
unlike + * in cgroup_init_subsys, there are now multiple css_sets, so each one + * will need a new pointer to it; done by iterating the css_set_table. + * furthermore, modifying the existing css_sets will corrupt the hash + * table state, so each changed css_set will need its hash recomputed. + * this is all done under the css_set_lock. + */ + write_lock(&css_set_lock); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct css_set *cg; + struct hlist_node *node, *tmp; + struct hlist_head *bucket = &css_set_table[i], *new_bucket; + + hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hlist_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + new_bucket = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, new_bucket); + } + } + write_unlock(&css_set_lock); + + mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); + ss->active = 1; + + /* success! */ + mutex_unlock(&cgroup_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(cgroup_load_subsys); + +/** + * cgroup_unload_subsys: unload a modular subsystem + * @ss: the subsystem to unload + * + * This function should be called in a modular subsystem's exitcall. When this + * function is invoked, the refcount on the subsystem's module will be 0, so + * the subsystem will not be attached to any hierarchy. + */ +void cgroup_unload_subsys(struct cgroup_subsys *ss) +{ + struct cg_cgroup_link *link; + struct hlist_head *hhead; + + BUG_ON(ss->module == NULL); + + /* + * we shouldn't be called if the subsystem is in use, and the use of + * try_module_get in parse_cgroupfs_options should ensure that it + * doesn't start being used while we're killing it off. + */ + BUG_ON(ss->root != &rootnode); + + mutex_lock(&cgroup_mutex); + /* deassign the subsys_id */ + BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); + subsys[ss->subsys_id] = NULL; + + /* remove subsystem from rootnode's list of subsystems */ + list_del(&ss->sibling); + + /* + * disentangle the css from all css_sets attached to the dummytop. as + * in loading, we need to pay our respects to the hashtable gods. + */ + write_lock(&css_set_lock); + list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + + hlist_del(&cg->hlist); + BUG_ON(!cg->subsys[ss->subsys_id]); + cg->subsys[ss->subsys_id] = NULL; + hhead = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, hhead); + } + write_unlock(&css_set_lock); + + /* + * remove subsystem's css from the dummytop and free it - need to free + * before marking as null because ss->destroy needs the cgrp->subsys + * pointer to find their state. note that this also takes care of + * freeing the css_id. 
+ */ + ss->destroy(ss, dummytop); + dummytop->subsys[ss->subsys_id] = NULL; + + mutex_unlock(&cgroup_mutex); +} +EXPORT_SYMBOL_GPL(cgroup_unload_subsys); + +/** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any @@ -3235,7 +3848,8 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; BUG_ON(!ss->name); @@ -3270,21 +3884,31 @@ int __init cgroup_init(void) if (err) return err; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) - cgroup_subsys_init_idr(ss); + cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); BUG_ON(!init_root_id(&rootnode)); + + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) { + err = -ENOMEM; + goto out; + } + err = register_filesystem(&cgroup_fs_type); - if (err < 0) + if (err < 0) { + kobject_put(cgroup_kobj); goto out; + } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); @@ -3379,9 +4003,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); @@ -3439,7 +4070,12 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * forkexit callbacks are only supported for builtin + * subsystems, and the builtin section of the subsys array is + * immutable, so we don't need to lock the subsys array here. 
+ */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) ss->fork(ss, child); @@ -3508,7 +4144,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct css_set *cg; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->exit) ss->exit(ss, tsk); @@ -3702,12 +4342,13 @@ static void check_for_release(struct cgroup *cgrp) } } -void __css_put(struct cgroup_subsys_state *css) +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css, int count) { struct cgroup *cgrp = css->cgroup; int val; rcu_read_lock(); - val = atomic_dec_return(&css->refcnt); + val = atomic_sub_return(count, &css->refcnt); if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -3718,6 +4359,7 @@ void __css_put(struct cgroup_subsys_state *css) rcu_read_unlock(); WARN_ON_ONCE(val < 1); } +EXPORT_SYMBOL_GPL(__css_put); /* * Notify userspace when a cgroup is released, by running the @@ -3799,8 +4441,11 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * cgroup_disable, being at boot time, can't know about module + * subsystems, so we don't worry about them. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!strcmp(token, ss->name)) { @@ -3824,31 +4469,65 @@ __setup("cgroup_disable=", cgroup_disable); */ unsigned short css_id(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + /* + * This css_id() can return correct value when somone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed. + */ + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->id; return 0; } +EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->depth; return 0; } +EXPORT_SYMBOL_GPL(css_depth); + +/** + * css_is_ancestor - test "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supporsed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after test. + * Assuming that the caller will do some action to the child if this returns + * returns true, the caller must take "child";s reference count. + * If "child" is valid object and this returns true, "root" is valid, too. 
+ */ bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) { - struct css_id *child_id = rcu_dereference(child->id); - struct css_id *root_id = rcu_dereference(root->id); + struct css_id *child_id; + struct css_id *root_id; + bool ret = true; - if (!child_id || !root_id || (child_id->depth < root_id->depth)) - return false; - return child_id->stack[root_id->depth] == root_id->id; + rcu_read_lock(); + child_id = rcu_dereference(child->id); + root_id = rcu_dereference(root->id); + if (!child_id + || !root_id + || (child_id->depth < root_id->depth) + || (child_id->stack[root_id->depth] != root_id->id)) + ret = false; + rcu_read_unlock(); + return ret; } static void __free_css_id_cb(struct rcu_head *head) @@ -3875,6 +4554,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_unlock(&ss->id_lock); call_rcu(&id->rcu_head, __free_css_id_cb); } +EXPORT_SYMBOL_GPL(free_css_id); /* * This is called by init or create(). Then, calls to this function are @@ -3924,15 +4604,14 @@ err_out: } -static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *rootcss) { struct css_id *newid; - struct cgroup_subsys_state *rootcss; spin_lock_init(&ss->id_lock); idr_init(&ss->idr); - rootcss = init_css_set.subsys[ss->subsys_id]; newid = get_new_cssid(ss, 0); if (IS_ERR(newid)) return PTR_ERR(newid); @@ -3948,13 +4627,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, { int subsys_id, i, depth = 0; struct cgroup_subsys_state *parent_css, *child_css; - struct css_id *child_id, *parent_id = NULL; + struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; - depth = css_depth(parent_css) + 1; parent_id = parent_css->id; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) @@ -3992,6 +4671,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) return rcu_dereference(cssid->css); } +EXPORT_SYMBOL_GPL(css_lookup); /** * css_get_next - lookup next cgroup under specified hierarchy. 
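The kernel/cgroup.c changes above add an eventfd-based notification interface: writes to the per-cgroup "cgroup.event_control" file (CGROUP_FILE_GENERIC_PREFIX plus "event_control") are parsed by cgroup_write_event_control() in the form "<event_fd> <control_fd> <args>", and the named control file's register_event()/unregister_event() callbacks wire the eventfd up. A minimal userspace sketch follows; the mount point, the cgroup name, the choice of memory.usage_in_bytes as a control file that implements register_event(), and the byte threshold passed as <args> are illustrative assumptions, not part of this patch.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/eventfd.h>

int main(void)
{
	/* eventfd to be signalled, control file to watch, event_control file */
	int efd = eventfd(0, 0);
	int cfd = open("/cgroup/memory/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ecfd = open("/cgroup/memory/mygroup/cgroup.event_control", O_WRONLY);
	char buf[64];
	uint64_t count;

	if (efd < 0 || cfd < 0 || ecfd < 0) {
		perror("setup");
		return 1;
	}
	/* format expected by cgroup_write_event_control(): "<event_fd> <control_fd> <args>" */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 4ULL << 20);
	if (write(ecfd, buf, strlen(buf)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}
	/* blocks until the controller signals the eventfd (or the cgroup is removed) */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("event fired %llu time(s)\n", (unsigned long long)count);

	close(ecfd);
	close(cfd);
	close(efd);
	return 0;
}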
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 59e9ef6aab4..ce71ed53e88 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -15,6 +15,7 @@ */ #include <linux/module.h> +#include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/uaccess.h> @@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_frozen(struct task_struct *task) +int cgroup_freezing_or_frozen(struct task_struct *task) { struct freezer *freezer; enum freezer_state state; task_lock(task); freezer = task_freezer(task); - state = freezer->state; + if (!freezer->css.cgroup->parent) + state = CGROUP_THAWED; /* root cgroup can't be frozen */ + else + state = freezer->state; task_unlock(task); - return state == CGROUP_FROZEN; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); } /* @@ -85,10 +89,10 @@ struct cgroup_subsys freezer_subsys; /* Locks taken and their ordering * ------------------------------ - * css_set_lock * cgroup_mutex (AKA cgroup_lock) - * task->alloc_lock (AKA task_lock) * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) * task->sighand->siglock * * cgroup code forces css_set_lock to be taken before task->alloc_lock @@ -96,33 +100,38 @@ struct cgroup_subsys freezer_subsys; * freezer_create(), freezer_destroy(): * cgroup_mutex [ by cgroup core ] * - * can_attach(): - * cgroup_mutex + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) * - * cgroup_frozen(): + * cgroup_freezing_or_frozen(): * task->alloc_lock (to get task's cgroup) * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * task->alloc_lock (to get task's cgroup) * freezer->lock * sighand->siglock (if the cgroup is freezing) * * freezer_read(): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) * * freezer_write() (freeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock + * sighand->siglock (fake signal delivery inside freeze_task()) * * freezer_write() (unfreeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (to prevent races with freeze_task()) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -201,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. 
*/ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the diff --git a/kernel/compat.c b/kernel/compat.c index f6c204f07ea..c9e2ec0b34a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -25,6 +25,7 @@ #include <linux/posix-timers.h> #include <linux/times.h> #include <linux/ptrace.h> +#include <linux/gfp.h> #include <asm/uaccess.h> @@ -278,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs (); - - if (resource >= RLIM_NLIMITS) - return -EINVAL; if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || __get_user(r.rlim_cur, &rlim->rlim_cur) || @@ -293,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, r.rlim_cur = RLIM_INFINITY; if (r.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; - set_fs(KERNEL_DS); - ret = sys_setrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); - return ret; + return do_prlimit(current, resource, &r, NULL); } #ifdef COMPAT_RLIM_OLD_INFINITY @@ -328,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit (unsigned int resource, +asmlinkage long compat_sys_getrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; int ret; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_getrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); + ret = do_prlimit(current, resource, NULL, &r); if (!ret) { if (r.rlim_cur > COMPAT_RLIM_INFINITY) r.rlim_cur = COMPAT_RLIM_INFINITY; @@ -494,29 +484,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, { int ret; cpumask_var_t mask; - unsigned long *k; - unsigned int min_length = cpumask_size(); - - if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) - min_length = sizeof(compat_ulong_t); - if (len < min_length) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); - if (ret < 0) - goto out; + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); - k = cpumask_bits(mask); - ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret == 0) - ret = min_length; - -out: + if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) + ret = -EFAULT; + else + ret = retlen; + } free_cpumask_var(mask); + return ret; } @@ -1139,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } + +/* + * Allocate user-space memory for the duration of a single system call, + * in order to marshall parameters inside a compat thunk. + */ +void __user *compat_alloc_user_space(unsigned long len) +{ + void __user *ptr; + + /* If len would occupy more than half of the entire compat space... 
*/ + if (unlikely(len > (((compat_uptr_t)~0) >> 1))) + return NULL; + + ptr = arch_compat_alloc_user_space(len); + + if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) + return NULL; + + return ptr; +} +EXPORT_SYMBOL_GPL(compat_alloc_user_space); diff --git a/kernel/cpu.c b/kernel/cpu.c index 1c8ddd6ee94..f6e726f1849 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,18 +14,35 @@ #include <linux/kthread.h> #include <linux/stop_machine.h> #include <linux/mutex.h> +#include <linux/gfp.h> #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +/* + * The following two API's must be used when attempting + * to serialize the updates to cpu_online_mask, cpu_present_mask. + */ +void cpu_maps_update_begin(void) +{ + mutex_lock(&cpu_add_remove_lock); +} + +void cpu_maps_update_done(void) +{ + mutex_unlock(&cpu_add_remove_lock); +} + +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; +#ifdef CONFIG_HOTPLUG_CPU + static struct { struct task_struct *active_writer; struct mutex lock; /* Synchronizes accesses to refcount, */ @@ -40,8 +57,6 @@ static struct { .refcount = 0, }; -#ifdef CONFIG_HOTPLUG_CPU - void get_online_cpus(void) { might_sleep(); @@ -66,22 +81,6 @@ void put_online_cpus(void) } EXPORT_SYMBOL_GPL(put_online_cpus); -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. - */ -void cpu_maps_update_begin(void) -{ - mutex_lock(&cpu_add_remove_lock); -} - -void cpu_maps_update_done(void) -{ - mutex_unlock(&cpu_add_remove_lock); -} - /* * This ensures that the hotplug operation can begin only when the * refcount goes to zero. @@ -123,6 +122,12 @@ static void cpu_hotplug_done(void) cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); } + +#else /* #if CONFIG_HOTPLUG_CPU */ +static void cpu_hotplug_begin(void) {} +static void cpu_hotplug_done(void) {} +#endif /* #esle #if CONFIG_HOTPLUG_CPU */ + /* Need to know about CPUs going up/down? 
*/ int __ref register_cpu_notifier(struct notifier_block *nb) { @@ -133,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +static int __cpu_notify(unsigned long val, void *v, int nr_to_call, + int *nr_calls) +{ + int ret; + + ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + nr_calls); + + return notifier_to_errno(ret); +} + +static int cpu_notify(unsigned long val, void *v) +{ + return __cpu_notify(val, v, -1, NULL); +} + #ifdef CONFIG_HOTPLUG_CPU +static void cpu_notify_nofail(unsigned long val, void *v) +{ + BUG_ON(cpu_notify(val, v)); +} + EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -151,18 +177,19 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && + if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " + "(state = %ld, flags = %x)\n", + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } struct take_cpu_down_param { + struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -171,6 +198,7 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; + unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -178,9 +206,10 @@ static int __ref take_cpu_down(void *_param) if (err < 0) return err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); + cpu_notify(CPU_DYING | param->mod, param->hcpu); + if (task_cpu(param->caller) == cpu) + move_task_off_dead_cpu(cpu, param->caller); /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -191,10 +220,10 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { + .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -205,38 +234,22 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - cpu_hotplug_begin(); - set_cpu_active(cpu, false); - err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, - hcpu, -1, &nr_calls); - if (err == NOTIFY_BAD) { - set_cpu_active(cpu, true); - + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + if (err) { nr_calls--; - __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu, nr_calls, NULL); + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __func__, cpu); - err = -EINVAL; goto out_release; } - /* Ensure that we are not runnable on dying cpu */ - cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpu_active_mask); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. 
*/ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu) == NOTIFY_BAD) - BUG(); + cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); - goto out_allowed; + goto out_release; } BUG_ON(cpu_online(cpu)); @@ -248,22 +261,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); + cpu_notify_nofail(CPU_DEAD | mod, hcpu); check_for_tasks(cpu); -out_allowed: - set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); - if (!err) { - if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); - } - free_cpumask_var(old_allowed); + if (!err) + cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); return err; } @@ -271,9 +276,6 @@ int __ref cpu_down(unsigned int cpu) { int err; - err = stop_machine_create(); - if (err) - return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -285,7 +287,6 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); - stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -302,13 +303,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, - -1, &nr_calls); - if (ret == NOTIFY_BAD) { + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); + if (ret) { nr_calls--; printk("%s: attempt to bring up CPU %u failed\n", __func__, cpu); - ret = -EINVAL; goto out_notify; } @@ -318,15 +317,12 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. 
*/ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); + cpu_notify(CPU_ONLINE | mod, hcpu); out_notify: if (ret != 0) - __raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); cpu_hotplug_done(); return ret; @@ -335,16 +331,44 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; + +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; + pg_data_t *pgdat; +#endif + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#if defined(CONFIG_IA64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif return -EINVAL; } +#ifdef CONFIG_MEMORY_HOTPLUG + nid = cpu_to_node(cpu); + if (!node_online(nid)) { + err = mem_online_node(nid); + if (err) + return err; + } + + pgdat = NODE_DATA(nid); + if (!pgdat) { + printk(KERN_ERR + "Can't online cpu %d due to NULL pgdat\n", cpu); + return -ENOMEM; + } + + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } +#endif + cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -364,11 +388,8 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error = 0; - error = stop_machine_create(); - if (error) - return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* @@ -399,7 +420,6 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); - stop_machine_destroy(); return error; } @@ -466,7 +486,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ - raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); + cpu_notify(val, (void *)(long)cpu); } #endif /* CONFIG_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459..b23c0979bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -105,7 +105,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset heirarchy */ + /* used for walking a cpuset hierarchy */ struct list_head stack_list; }; @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -949,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * In order to avoid seeing no nodes if the old and new nodes are disjoint, * we structure updates as setting all new allowed nodes, then clearing newly * disallowed ones. - * - * Called with task's alloc_lock held */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. 
+ */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed); - mpol_rebind_task(tsk, newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure checking ->mems_allowed_change_disable after setting all new + * allowed nodes. + * + * the read-side task can see an nodemask with new allowed nodes and + * old allowed nodes. and if it allocates page when cpuset clears newly + * disallowed ones continuous, it can see the new allowed bits. + * + * And if setting all new allowed nodes is after the checking, setting + * all new allowed nodes and clearing newly disallowed ones will be done + * continuous, and the read-side task may find no node to alloc page. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. + */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure checking ->mems_allowed_change_disable before clearing all new + * disallowed nodes. + * + * if clearing newly disallowed bits before the checking, the read-side + * task may find no node to alloc page. + */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + task_unlock(tsk); } /* @@ -973,14 +1016,17 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - nodemask_t newmems; + NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); + + if (!newmems) + return; cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); + guarantee_online_mems(cs, newmems); + + cpuset_change_task_nodemask(p, newmems); - task_lock(p); - cpuset_change_task_nodemask(p, &newmems); - task_unlock(p); + NODEMASK_FREE(newmems); mm = get_task_mm(p); if (!mm) @@ -1051,16 +1097,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - nodemask_t oldmem; + NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; + if (!oldmem) + return -ENOMEM; + /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; * it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + if (cs == &top_cpuset) { + retval = -EACCES; + goto done; + } /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. 
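Several of the kernel/cpuset.c hunks above and below this point convert on-stack nodemask_t variables to heap allocations, since a nodemask can be large on big-CONFIG_NODES_SHIFT builds. A minimal sketch of the recurring NODEMASK_ALLOC()/NODEMASK_FREE() shape follows; the helper name and body are invented for illustration, it assumes <linux/nodemask.h> and the struct cpuset definition local to cpuset.c, and the callback_mutex locking and per-task rebinding done by the real update_nodemask() are omitted.

/* illustrative only: mirrors the allocate/check/use/free pattern in the patch */
static int example_update_mems(struct cpuset *cs, const nodemask_t *newmems)
{
	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
	int retval = 0;

	if (oldmem == NULL)		/* kmalloc-backed when NODES_SHIFT is large */
		return -ENOMEM;

	*oldmem = cs->mems_allowed;	/* snapshot before changing anything */
	if (nodes_equal(*oldmem, *newmems))
		goto done;		/* nothing to do */

	cs->mems_allowed = *newmems;	/* real code also rebinds each task's nodemask */
done:
	NODEMASK_FREE(oldmem);		/* no-op when the mask lived on the stack */
	return retval;
}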
@@ -1076,11 +1127,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; + node_states[N_HIGH_MEMORY])) { + retval = -EINVAL; + goto done; + } } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs->mems_allowed)) { + *oldmem = cs->mems_allowed; + if (nodes_equal(*oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1096,10 +1149,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &oldmem, &heap); + update_tasks_nodemask(cs, oldmem, &heap); heap_free(&heap); done: + NODEMASK_FREE(oldmem); return retval; } @@ -1373,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - task_lock(tsk); cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); } @@ -1384,40 +1436,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk, bool threadgroup) { - nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); + NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); + + if (from == NULL || to == NULL) + goto alloc_fail; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); + cpuset_attach_task(tsk, to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); + cpuset_attach_task(c, to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - from = oldcs->mems_allowed; - to = cs->mems_allowed; + *from = oldcs->mems_allowed; + *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &from, &to); + cpuset_migrate_mm(mm, from, to); mmput(mm); } + +alloc_fail: + NODEMASK_FREE(from); + NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1562,13 +1621,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - nodemask_t mask; + NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); + int retval; + + if (mask == NULL) + return -ENOMEM; mutex_lock(&callback_mutex); - mask = cs->mems_allowed; + *mask = cs->mems_allowed; mutex_unlock(&callback_mutex); - return nodelist_scnprintf(page, PAGE_SIZE, mask); + retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); + + NODEMASK_FREE(mask); + + return retval; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1997,7 +2064,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return; list_add_tail((struct 
list_head *)&root->stack_list, &queue); @@ -2014,7 +2084,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - oldmems = cp->mems_allowed; + *oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2030,9 +2100,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + update_tasks_nodemask(cp, oldmems, NULL); } } + NODEMASK_FREE(oldmems); } /* @@ -2042,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2077,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2090,20 +2145,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return NOTIFY_DONE; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + *oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; } cgroup_unlock(); + + NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif @@ -2119,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); @@ -2140,19 +2207,52 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); - cpuset_cpus_allowed_locked(tsk, pmask); + task_lock(tsk); + guarantee_online_cpus(task_cs(tsk), pmask); + task_unlock(tsk); mutex_unlock(&callback_mutex); } -/** - * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. 
- **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +int cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); - task_unlock(tsk); + const struct cpuset *cs; + int cpu; + + rcu_read_lock(); + cs = task_cs(tsk); + if (cs) + cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); + rcu_read_unlock(); + + /* + * We own tsk->cpus_allowed, nobody can change it under us. + * + * But we used cs && cs->cpus_allowed lockless and thus can + * race with cgroup_attach_task() or update_cpumask() and get + * the wrong tsk->cpus_allowed. However, both cases imply the + * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() + * which takes task_rq_lock(). + * + * If we are called after it dropped the lock we must see all + * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary + * set any mask even if it is not right from task_cs() pov, + * the pending set_cpus_allowed_ptr() will fix things. + */ + + cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); + if (cpu >= nr_cpu_ids) { + /* + * Either tsk->cpus_allowed is wrong (see above) or it + * is actually empty. The latter case is only possible + * if we are racing with remove_tasks_in_empty_cpuset(). + * Like above we can temporary set any mask and rely on + * set_cpus_allowed_ptr() as synchronization point. + */ + cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); + cpu = cpumask_any(cpu_active_mask); + } + + return cpu; } void cpuset_init_current_mems_allowed(void) @@ -2341,22 +2441,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) } /** - * cpuset_lock - lock out any changes to cpuset structures - * - * The out of memory (oom) code needs to mutex_lock cpusets - * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_mutex via this - * cpuset_lock() routine, so the oom code can lock it, before - * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_mutex. - */ - -void cpuset_lock(void) -{ - mutex_lock(&callback_mutex); -} - -/** * cpuset_unlock - release lock on cpuset changes * * Undo the lock taken in a previous cpuset_lock() call. @@ -2368,7 +2452,8 @@ void cpuset_unlock(void) } /** - * cpuset_mem_spread_node() - On which node to begin search for a page + * cpuset_mem_spread_node() - On which node to begin search for a file page + * cpuset_slab_spread_node() - On which node to begin search for a slab page * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), @@ -2393,16 +2478,27 @@ void cpuset_unlock(void) * See kmem_cache_alloc_node(). 
*/ -int cpuset_mem_spread_node(void) +static int cpuset_spread_node(int *rotor) { int node; - node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); + node = next_node(*rotor, current->mems_allowed); if (node == MAX_NUMNODES) node = first_node(current->mems_allowed); - current->cpuset_mem_spread_rotor = node; + *rotor = node; return node; } + +int cpuset_mem_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); +} + +int cpuset_slab_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); +} + EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); /** diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h deleted file mode 100644 index 2dc4fc2d0bf..00000000000 --- a/kernel/cred-internals.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Internal credentials stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -/* - * user.c - */ -static inline void sched_switch_user(struct task_struct *p) -{ -#ifdef CONFIG_USER_SCHED - sched_move_task(p); -#endif /* CONFIG_USER_SCHED */ -} - diff --git a/kernel/cred.c b/kernel/cred.c index dd76cfe5f5b..9a3e22641fe 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -10,22 +10,18 @@ */ #include <linux/module.h> #include <linux/cred.h> +#include <linux/slab.h> #include <linux/sched.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> #include <linux/cn_proc.h> -#include "cred-internals.h" #if 0 #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) #else -static inline __attribute__((format(printf, 1, 2))) -void no_printk(const char *fmt, ...) -{ -} #define kdebug(FMT, ...) \ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) #endif @@ -209,6 +205,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. 
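The get_task_cred() helper added above loops on atomic_inc_not_zero() under rcu_read_lock() so it never takes a reference on a cred whose last reference is already on its way to zero. A caller-side sketch (example_task_uid() is an illustrative helper, not something introduced by this diff):

	/* Read another task's uid without dereferencing task->cred directly. */
	static uid_t example_task_uid(struct task_struct *task)
	{
		const struct cred *cred = get_task_cred(task);
		uid_t uid = cred->uid;

		put_cred(cred);		/* drop the reference taken above */
		return uid;
	}

The caller must still keep the task itself alive (by holding a task reference or tasklist_lock), as the kerneldoc above points out.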
@@ -224,7 +245,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_KEYS new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); if (!new->tgcred) { - kfree(new); + kmem_cache_free(cred_jar, new); return NULL; } atomic_set(&new->tgcred->usage, 1); @@ -347,60 +368,6 @@ struct cred *prepare_exec_creds(void) } /* - * prepare new credentials for the usermode helper dispatcher - */ -struct cred *prepare_usermodehelper_creds(void) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = NULL; -#endif - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC); - if (!tgcred) - return NULL; -#endif - - new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); - if (!new) - return NULL; - - kdebug("prepare_usermodehelper_creds() alloc %p", new); - - memcpy(new, &init_cred, sizeof(struct cred)); - - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); - get_group_info(new->group_info); - get_uid(new->user); - -#ifdef CONFIG_KEYS - new->thread_keyring = NULL; - new->request_key_auth = NULL; - new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT; - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - new->tgcred = tgcred; -#endif - -#ifdef CONFIG_SECURITY - new->security = NULL; -#endif - if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) - goto error; - validate_creds(new); - - BUG_ON(atomic_read(&new->usage) != 1); - return new; - -error: - put_cred(new); - return NULL; -} - -/* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new @@ -516,8 +483,6 @@ int commit_creds(struct cred *new) #endif BUG_ON(atomic_read(&new->usage) < 1); - security_commit_creds(new, old); - get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ @@ -553,8 +518,6 @@ int commit_creds(struct cred *new) atomic_dec(&old->user->processes); alter_cred_subscribers(old, -2); - sched_switch_user(task); - /* send notifications */ if (new->uid != old->uid || new->euid != old->euid || @@ -786,8 +749,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; - if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) - return true; #ifdef CONFIG_SECURITY_SELINUX if (selinux_is_enabled()) { if ((unsigned long) cred->security < PAGE_SIZE) diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile new file mode 100644 index 00000000000..a85edc33998 --- /dev/null +++ b/kernel/debug/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the linux kernel debugger +# + +obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o +obj-$(CONFIG_KGDB_KDB) += kdb/ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c new file mode 100644 index 00000000000..de407c78178 --- /dev/null +++ b/kernel/debug/debug_core.c @@ -0,0 +1,985 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. 
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#include <linux/pid_namespace.h> +#include <linux/clocksource.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/threads.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/pid.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/byteorder.h> +#include <asm/atomic.h> +#include <asm/system.h> + +#include "debug_core.h" + +static int kgdb_break_asap; + +struct debuggerinfo_struct kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +struct kgdb_io *dbg_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; +/* Flag for alternate operations for early debugging */ +bool dbg_is_early = true; +/* Next cpu to become the master debug core */ +int dbg_switch_cpu; + +/* Use kdb or gdbserver mode */ +int dbg_kdb_mode = 1; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); +EXPORT_SYMBOL_GPL(kgdb_active); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +static atomic_t kgdb_break_tasklet_var; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; +static pid_t kgdb_sstep_pid; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. 
In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +static int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overridden by architectures when needed: + */ +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. If the + * remove fails, the kernel needs to emit a bad message because we + * are in deep trouble, not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling an exception.
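+ *
+ * Purely as an illustration (not part of this patch): an architecture
+ * with x86-style debug registers could satisfy this hook roughly as
+ * below, where set_debugreg() is assumed to be that architecture's
+ * helper for writing debug register 7:
+ *
+ *	void kgdb_disable_hw_debug(struct pt_regs *regs)
+ *	{
+ *		set_debugreg(0UL, 7);	(clear DR7: disable all hw breakpoints)
+ *	}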
+ */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +int dbg_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return ret; +} + +int dbg_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +int dbg_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return ret; +} + +int dbg_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +int dbg_remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Return true if there is a valid kgdb I/O module. 
Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!dbg_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); +#else + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); +#endif + } + return 1; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + dbg_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. + */ + if (dbg_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + dbg_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + dbg_remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); +#ifdef CONFIG_KGDB_KDB + /* Allow kdb to debug itself one level */ + return 0; +#endif + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +static void dbg_cpu_switch(int cpu, int next_cpu) +{ + /* Mark the cpu we are switching away from as a slave when it + * holds the kgdb_active token. This must be done so that the + * that all the cpus wait in for the debug core will not enter + * again as the master. */ + if (cpu == atomic_read(&kgdb_active)) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; + } + kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; +} + +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) +{ + unsigned long flags; + int sstep_tries = 100; + int error; + int i, cpu; + int trace_on = 0; +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. 
+ */ + local_irq_save(flags); + + cpu = ks->cpu; + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + kgdb_info[cpu].ret_state = 0; + kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + atomic_inc(&cpu_in_kgdb[cpu]); + + if (exception_level == 1) + goto cpu_master_loop; + + /* + * CPU will loop if it is a slave or request to become a kgdb + * master cpu and acquire the kgdb_active lock: + */ + while (1) { +cpu_loop: + if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) { + kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; + goto cpu_master_loop; + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { + if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) + break; + } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { + if (!atomic_read(&passive_cpu_wait[cpu])) + goto return_normal; + } else { +return_normal: + /* Return to normal operation by executing any + * hw breakpoint fixup. + */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + if (trace_on) + tracing_on(); + atomic_dec(&cpu_in_kgdb[cpu]); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + return 0; + } + cpu_relax(); + } + + /* + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { + atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + kgdb_info[cpu].ret_state = 1; + goto kgdb_restore; /* No I/O connection, resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. 
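+ * That is, the debugger has already deleted the breakpoint but this
+ * CPU took the trap before the original instruction was restored;
+ * kgdb_skipexception() gives the architecture a chance to fix up the
+ * PC so the system can resume without entering the debugger.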
+ */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (dbg_io_ops->pre_exception) + dbg_io_ops->pre_exception(); + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step) { + for (i = 0; i < NR_CPUS; i++) + atomic_inc(&passive_cpu_wait[i]); + } + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + dbg_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = current; + exception_level = 0; + trace_on = tracing_is_on(); + if (trace_on) + tracing_off(); + + while (1) { +cpu_master_loop: + if (dbg_kdb_mode) { + kgdb_connected = 1; + error = kdb_stub(ks); + if (error == -1) + continue; + kgdb_connected = 0; + } else { + error = gdb_serial_stub(ks); + } + + if (error == DBG_PASS_EVENT) { + dbg_kdb_mode = !dbg_kdb_mode; + } else if (error == DBG_SWITCH_CPU_EVENT) { + dbg_cpu_switch(cpu, dbg_switch_cpu); + goto cpu_loop; + } else { + kgdb_info[cpu].ret_state = error; + break; + } + } + + /* Call the I/O driver's post_exception routine */ + if (dbg_io_ops->post_exception) + dbg_io_ops->post_exception(); + + atomic_dec(&cpu_in_kgdb[ks->cpu]); + + if (!kgdb_single_step) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_dec(&passive_cpu_wait[i]); + /* + * Wait till all the CPUs have quit from the debugger, + * but allow a CPU that hit an exception and is + * waiting to become the master to remain in the debug + * core. + */ + for_each_online_cpu(i) { + while (kgdb_do_roundup && + atomic_read(&cpu_in_kgdb[i]) && + !(kgdb_info[i].exception_state & + DCPU_WANT_MASTER)) + cpu_relax(); + } + } + +kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } + if (trace_on) + tracing_on(); + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + return kgdb_info[cpu].ret_state; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + int ret; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! 
*/ + kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; + ret = kgdb_cpu_enter(ks, regs); + kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | + DCPU_IS_SLAVE); + return ret; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->linux_regs = regs; + + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != -1 && + atomic_read(&kgdb_active) != cpu) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_cpu_enter(ks, regs); + kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; + return 0; + } +#endif + return 1; +} + +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. */ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode) + return; + + local_irq_save(flags); + gdbstub_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_dbg(int key) +{ + if (!dbg_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); +#else + printk(KERN_CRIT "Entering KGDB\n"); +#endif + } + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_dbg_op = { + .handler = sysrq_handle_dbg, + .help_msg = "debug(G)", + .action_msg = "DEBUG", +}; +#endif + +static int kgdb_panic_event(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (dbg_kdb_mode) + kdb_printf("PANIC: %s\n", (char *)data); + kgdb_breakpoint(); + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_panic_event_nb = { + .notifier_call = kgdb_panic_event, + .priority = INT_MAX, +}; + +void __weak kgdb_arch_late(void) +{ +} + +void __init dbg_late_init(void) +{ + dbg_is_early = false; + if (kgdb_io_module_registered) + kgdb_arch_late(); + kdb_init(KDB_INIT_FULL); +} + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); + if (!dbg_is_early) + kgdb_arch_late(); + atomic_notifier_chain_register(&panic_notifier_list, + &kgdb_panic_event_nb); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_dbg_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + atomic_notifier_chain_unregister(&panic_notifier_list, + &kgdb_panic_event_nb); + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_dbg_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +/* + * There are times a tasklet needs to be used vs a compiled in + * break point so as to cause an exception outside a kgdb I/O module, + * such as is the case with kgdboe, where calling a breakpoint in the + * I/O driver itself would be fatal. 
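+ *
+ * An I/O driver in that position would typically call
+ * kgdb_schedule_breakpoint() from its receive path when it sees the
+ * debugger's break character, e.g. (illustrative only):
+ *
+ *	if (ch == 0x03)			(gdb's ^C break character)
+ *		kgdb_schedule_breakpoint();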
+ */ +static void kgdb_tasklet_bpt(unsigned long ing) +{ + kgdb_breakpoint(); + atomic_set(&kgdb_break_tasklet_var, 0); +} + +static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); + +void kgdb_schedule_breakpoint(void) +{ + if (atomic_read(&kgdb_break_tasklet_var) || + atomic_read(&kgdb_active) != -1 || + atomic_read(&kgdb_setting_breakpoint)) + return; + atomic_inc(&kgdb_break_tasklet_var); + tasklet_schedule(&kgdb_tasklet_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_dbg_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (dbg_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_dbg_io_ops->init) { + err = new_dbg_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + dbg_io_ops = new_dbg_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_dbg_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_dbg_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops); + dbg_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_dbg_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +int dbg_io_get_char(void) +{ + int ret = dbg_io_ops->read_char(); + if (ret == NO_POLL_CHAR) + return -1; + if (!dbg_kdb_mode) + return ret; + if (ret == 127) + return 8; + return ret; +} + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_inc(&kgdb_setting_breakpoint); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_dec(&kgdb_setting_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + kdb_init(KDB_INIT_EARLY); + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h new file mode 100644 index 00000000000..c5d753d80f6 --- /dev/null +++ b/kernel/debug/debug_core.h @@ -0,0 +1,81 @@ +/* + * Created by: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (c) 2009 Wind River Systems, Inc. 
All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _DEBUG_CORE_H_ +#define _DEBUG_CORE_H_ +/* + * These are the private implementation headers between the kernel + * debugger core and the debugger front end code. + */ + +/* kernel debug core data structures */ +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + unsigned long thr_query; + unsigned long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP 0x8 /* CPU is single stepping */ + +struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; + int exception_state; + int ret_state; + int irq_depth; +}; + +extern struct debuggerinfo_struct kgdb_info[]; + +/* kernel debug core break point routines */ +extern int dbg_remove_all_break(void); +extern int dbg_set_sw_break(unsigned long addr); +extern int dbg_remove_sw_break(unsigned long addr); +extern int dbg_activate_sw_breakpoints(void); +extern int dbg_deactivate_sw_breakpoints(void); + +/* polled character access to i/o module */ +extern int dbg_io_get_char(void); + +/* stub return value for switching between the gdbstub and kdb */ +#define DBG_PASS_EVENT -12345 +/* Switch from one cpu to another */ +#define DBG_SWITCH_CPU_EVENT -123456 +extern int dbg_switch_cpu; + +/* gdbstub interface functions */ +extern int gdb_serial_stub(struct kgdb_state *ks); +extern void gdbstub_msg_write(const char *s, int len); + +/* gdbstub functions used for kdb <-> gdbstub transition */ +extern int gdbstub_state(struct kgdb_state *ks, char *cmd); +extern int dbg_kdb_mode; + +#ifdef CONFIG_KGDB_KDB +extern int kdb_stub(struct kgdb_state *ks); +extern int kdb_parse(const char *cmdstr); +#else /* ! CONFIG_KGDB_KDB */ +static inline int kdb_stub(struct kgdb_state *ks) +{ + return DBG_PASS_EVENT; +} +#endif /* CONFIG_KGDB_KDB */ + +#endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c new file mode 100644 index 00000000000..481a7bd2dfe --- /dev/null +++ b/kernel/debug/gdbstub.c @@ -0,0 +1,1095 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. 
This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include <linux/kernel.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/reboot.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/unaligned.h> +#include "debug_core.h" + +#define KGDB_MAX_THREAD_QUERY 17 + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* + * GDB remote protocol parser: + */ + +#ifdef CONFIG_KGDB_KDB +static int gdbstub_read_wait(void) +{ + int ret = -1; + int i; + + /* poll any additional I/O interfaces that are defined */ + while (ret < 0) + for (i = 0; kdb_poll_funcs[i] != NULL; i++) { + ret = kdb_poll_funcs[i](); + if (ret > 0) + break; + } + return ret; +} +#else +static int gdbstub_read_wait(void) +{ + int ret = dbg_io_ops->read_char(); + while (ret == NO_POLL_CHAR) + ret = dbg_io_ops->read_char(); + return ret; +} +#endif +/* scan for the sequence $<data>#<checksum> */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (gdbstub_read_wait())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = gdbstub_read_wait(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; + xmitcsum += hex_to_bin(gdbstub_read_wait()); + + if (checksum != xmitcsum) + /* failed checksum */ + dbg_io_ops->write_char('-'); + else + /* successful transfer */ + dbg_io_ops->write_char('+'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $<packet info>#<checksum>. + */ + while (1) { + dbg_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + dbg_io_ops->write_char(ch); + checksum += ch; + count++; + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = gdbstub_read_wait(); + + if (ch == 3) + ch = gdbstub_read_wait(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + dbg_io_ops->write_char('-'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + return; + } + } +} + +static char gdbmsgbuf[BUFMAX + 1]; + +void gdbstub_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + if (len == 0) + len = strlen(s); + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... 
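+ * Each message byte becomes two hex characters; with the leading 'O'
+ * and the trailing NUL that leaves room for at most (BUFMAX - 2) / 2
+ * bytes per packet, so longer messages are split across several 'O'
+ * packets.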
*/ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Convert the memory pointed to by mem into hex, placing result in + * buf. Return a pointer to the last char put in buf (null). May + * return an error. + */ +char *kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + *buf = 0; + + return buf; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in + * mem. Return a pointer to the character AFTER the last byte + * written. May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex_to_bin(*tmp_hex--); + *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, unsigned long *long_val) +{ + int hex_val; + int num = 0; + int negate = 0; + + *long_val = 0; + + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } + while (**ptr) { + hex_val = hex_to_bin(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + if (negate) + *long_val = -*long_val; + + return num; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwitten with the result to write to mem. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int size = 0; + char *c = buf; + + while (count-- > 0) { + c[size] = *buf++; + if (c[size] == 0x7d) + c[size] = *buf++ ^ 0x20; + size++; + } + + return probe_kernel_write(mem, c, size); +} + +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + +/* Write memory due to an 'M' or 'X' packet. 
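+ * The remote protocol encodes these as
+ *	Maddr,length:<hex bytes>	(binary == 0)
+ *	Xaddr,length:<escaped binary>	(binary == 1)
+ * which matches the addr / ',' / length / ':' parsing below.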
*/ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. + */ + +#define BUF_THREAD_ID_SIZE 8 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + unsigned char *limit; + int lzero = 1; + + limit = id + (BUF_THREAD_ID_SIZE / 2); + while (id < limit) { + if (!lzero || *id != 0) { + pkt = pack_hex_byte(pkt, *id); + lzero = 0; + } + id++; + } + + if (lzero) + pkt = pack_hex_byte(pkt, 0); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + put_unaligned_be32(value, id); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped to the cpu shadow information + */ + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < -1 && tid > -NR_CPUS - 2) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + + +/* + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -raw_smp_processor_id() - 2; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. + */ + dbg_remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +static void gdb_get_regs_helper(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for_each_online_cpu(i) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. 
+ */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + char *err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ +static void gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + int i = 0; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + memset(gdb_regs, 0, sizeof(gdb_regs)); + while (i < sizeof(gdb_regs) * 2) + if (hex_to_bin(ptr[i]) >= 0) + i++; + else + break; + i = i / 2; + kgdb_hex2mem(ptr, (char *)gdb_regs, i); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = dbg_remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * 
Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + dbg_remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *g; + struct task_struct *p; + unsigned char thref[BUF_THREAD_ID_SIZE]; + char *ptr; + int i; + int cpu; + int finished = 0; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) + break; + + i = 0; + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); + ptr = pack_threadid(ptr, thref); + *(ptr++) = ','; + i++; + } + } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + ptr = pack_threadid(ptr, thref); + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) + break; + + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if ((int)ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; +#ifdef CONFIG_KGDB_KDB + case 'R': + if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) { + int len = strlen(remcom_in_buffer + 6); + + if ((len % 2) != 0) { + strcpy(remcom_out_buffer, "E01"); + break; + } + kgdb_hex2mem(remcom_in_buffer + 6, + remcom_out_buffer, len); + len = len / 2; + remcom_out_buffer[len++] = 0; + + kdb_parse(remcom_out_buffer); + strcpy(remcom_out_buffer, "OK"); + } + break; +#endif + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + 
error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. */ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = dbg_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = dbg_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + dbg_remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + gdbstub_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Initialize comm buffer and globals. 
*/ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + if (kgdb_connected) { + unsigned char thref[BUF_THREAD_ID_SIZE]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. + */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; +#ifdef CONFIG_KGDB_KDB + case '3': /* Escape into back into kdb */ + if (remcom_in_buffer[1] == '\0') { + gdb_cmd_detachkill(ks); + return DBG_PASS_EVENT; + } +#endif + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + dbg_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. 
+ */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +int gdbstub_state(struct kgdb_state *ks, char *cmd) +{ + int error; + + switch (cmd[0]) { + case 'e': + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + return error; + case 's': + case 'c': + strcpy(remcom_in_buffer, cmd); + return 0; + case '?': + gdb_cmd_status(ks); + break; + case '\0': + strcpy(remcom_out_buffer, ""); + break; + } + dbg_io_ops->write_char('+'); + put_packet(remcom_out_buffer); + return 0; +} diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore new file mode 100644 index 00000000000..396d12eda9e --- /dev/null +++ b/kernel/debug/kdb/.gitignore @@ -0,0 +1 @@ +gen-kdb_cmds.c diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile new file mode 100644 index 00000000000..d4fc58f4b88 --- /dev/null +++ b/kernel/debug/kdb/Makefile @@ -0,0 +1,25 @@ +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. +# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. +# + +CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') +obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o +obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o + +clean-files := gen-kdb_cmds.c + +quiet_cmd_gen-kdb = GENKDB $@ + cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \ + /^\#/{next} \ + /^[ \t]*$$/{next} \ + {gsub(/"/, "\\\"", $$0); \ + print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \ + END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \ + $(filter-out %/Makefile,$^) > $@# + +$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile + $(call cmd,gen-kdb) diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c new file mode 100644 index 00000000000..20059ef4459 --- /dev/null +++ b/kernel/debug/kdb/kdb_bp.c @@ -0,0 +1,562 @@ +/* + * Kernel Debugger Architecture Independent Breakpoint Handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
+ */ + +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kdb.h> +#include <linux/kgdb.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include "kdb_private.h" + +/* + * Table of kdb_breakpoints + */ +kdb_bp_t kdb_breakpoints[KDB_MAXBPT]; + +static void kdb_setsinglestep(struct pt_regs *regs) +{ + KDB_STATE_SET(DOING_SS); +} + +static char *kdb_rwtypes[] = { + "Instruction(i)", + "Instruction(Register)", + "Data Write", + "I/O", + "Data Access" +}; + +static char *kdb_bptype(kdb_bp_t *bp) +{ + if (bp->bp_type < 0 || bp->bp_type > 4) + return ""; + + return kdb_rwtypes[bp->bp_type]; +} + +static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) +{ + int nextarg = *nextargp; + int diag; + + bp->bph_length = 1; + if ((argc + 1) != nextarg) { + if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + bp->bp_type = BP_ACCESS_WATCHPOINT; + else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + bp->bp_type = BP_WRITE_WATCHPOINT; + else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + bp->bp_type = BP_HARDWARE_BREAKPOINT; + else + return KDB_ARGCOUNT; + + bp->bph_length = 1; + + nextarg++; + + if ((argc + 1) != nextarg) { + unsigned long len; + + diag = kdbgetularg((char *)argv[nextarg], + &len); + if (diag) + return diag; + + + if (len > 8) + return KDB_BADLENGTH; + + bp->bph_length = len; + nextarg++; + } + + if ((argc + 1) != nextarg) + return KDB_ARGCOUNT; + } + + *nextargp = nextarg; + return 0; +} + +static int _kdb_bp_remove(kdb_bp_t *bp) +{ + int ret = 1; + if (!bp->bp_installed) + return ret; + if (!bp->bp_type) + ret = dbg_remove_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) + bp->bp_installed = 0; + return ret; +} + +static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp) +{ + if (KDB_DEBUG(BP)) + kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs)); + + /* + * Setup single step + */ + kdb_setsinglestep(regs); + + /* + * Reset delay attribute + */ + bp->bp_delay = 0; + bp->bp_delayed = 1; +} + +static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) +{ + int ret; + /* + * Install the breakpoint, if it is not already installed. + */ + + if (KDB_DEBUG(BP)) + kdb_printf("%s: bp_installed %d\n", + __func__, bp->bp_installed); + if (!KDB_STATE(SSBPT)) + bp->bp_delay = 0; + if (bp->bp_installed) + return 1; + if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) { + if (KDB_DEBUG(BP)) + kdb_printf("%s: delayed bp\n", __func__); + kdb_handle_bp(regs, bp); + return 0; + } + if (!bp->bp_type) + ret = dbg_set_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) { + bp->bp_installed = 1; + } else { + kdb_printf("%s: failed to set breakpoint at 0x%lx\n", + __func__, bp->bp_addr); + return 1; + } + return 0; +} + +/* + * kdb_bp_install + * + * Install kdb_breakpoints prior to returning from the + * kernel debugger. This allows the kdb_breakpoints to be set + * upon functions that are used internally by kdb, such as + * printk(). This function is only called once per kdb session. 
+ */ +void kdb_bp_install(struct pt_regs *regs) +{ + int i; + + for (i = 0; i < KDB_MAXBPT; i++) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_install(regs, bp); + } +} + +/* + * kdb_bp_remove + * + * Remove kdb_breakpoints upon entry to the kernel debugger. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ +void kdb_bp_remove(void) +{ + int i; + + for (i = KDB_MAXBPT - 1; i >= 0; i--) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_remove(bp); + } +} + + +/* + * kdb_printbp + * + * Internal function to format and print a breakpoint entry. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +static void kdb_printbp(kdb_bp_t *bp, int i) +{ + kdb_printf("%s ", kdb_bptype(bp)); + kdb_printf("BP #%d at ", i); + kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); + + if (bp->bp_enabled) + kdb_printf("\n is enabled"); + else + kdb_printf("\n is disabled"); + + kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + bp->bp_addr, bp->bp_type, bp->bp_installed); + + kdb_printf("\n"); +} + +/* + * kdb_bp + * + * Handle the bp commands. + * + * [bp|bph] <addr-expression> [DATAR|DATAW] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. + * Remarks: + * + * bp Set breakpoint on all cpus. Only use hardware assist if need. + * bph Set breakpoint on all cpus. Force hardware register + */ + +static int kdb_bp(int argc, const char **argv) +{ + int i, bpno; + kdb_bp_t *bp, *bp_check; + int diag; + char *symname = NULL; + long offset = 0ul; + int nextarg; + kdb_bp_t template = {0}; + + if (argc == 0) { + /* + * Display breakpoint table + */ + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; + bpno++, bp++) { + if (bp->bp_free) + continue; + kdb_printbp(bp, bpno); + } + + return 0; + } + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr, + &offset, &symname); + if (diag) + return diag; + if (!template.bp_addr) + return KDB_BADINT; + + /* + * Find an empty bp structure to allocate + */ + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { + if (bp->bp_free) + break; + } + + if (bpno == KDB_MAXBPT) + return KDB_TOOMANYBPT; + + if (strcmp(argv[0], "bph") == 0) { + template.bp_type = BP_HARDWARE_BREAKPOINT; + diag = kdb_parsebp(argc, argv, &nextarg, &template); + if (diag) + return diag; + } else { + template.bp_type = BP_BREAKPOINT; + } + + /* + * Check for clashing breakpoints. + * + * Note, in this design we can't have hardware breakpoints + * enabled for both read and write on the same address. 
+ */ + for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp_check++) { + if (!bp_check->bp_free && + bp_check->bp_addr == template.bp_addr) { + kdb_printf("You already have a breakpoint at " + kdb_bfd_vma_fmt0 "\n", template.bp_addr); + return KDB_DUPBPT; + } + } + + template.bp_enabled = 1; + + /* + * Actually allocate the breakpoint found earlier + */ + *bp = template; + bp->bp_free = 0; + + kdb_printbp(bp, bpno); + + return 0; +} + +/* + * kdb_bc + * + * Handles the 'bc', 'be', and 'bd' commands + * + * [bd|bc|be] <breakpoint-number> + * [bd|bc|be] * + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + */ +static int kdb_bc(int argc, const char **argv) +{ + unsigned long addr; + kdb_bp_t *bp = NULL; + int lowbp = KDB_MAXBPT; + int highbp = 0; + int done = 0; + int i; + int diag = 0; + + int cmd; /* KDBCMD_B? */ +#define KDBCMD_BC 0 +#define KDBCMD_BE 1 +#define KDBCMD_BD 2 + + if (strcmp(argv[0], "be") == 0) + cmd = KDBCMD_BE; + else if (strcmp(argv[0], "bd") == 0) + cmd = KDBCMD_BD; + else + cmd = KDBCMD_BC; + + if (argc != 1) + return KDB_ARGCOUNT; + + if (strcmp(argv[1], "*") == 0) { + lowbp = 0; + highbp = KDB_MAXBPT; + } else { + diag = kdbgetularg(argv[1], &addr); + if (diag) + return diag; + + /* + * For addresses less than the maximum breakpoint number, + * assume that the breakpoint number is desired. + */ + if (addr < KDB_MAXBPT) { + bp = &kdb_breakpoints[addr]; + lowbp = highbp = addr; + highbp++; + } else { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp++) { + if (bp->bp_addr == addr) { + lowbp = highbp = i; + highbp++; + break; + } + } + } + } + + /* + * Now operate on the set of breakpoints matching the input + * criteria (either '*' for all, or an individual breakpoint). + */ + for (bp = &kdb_breakpoints[lowbp], i = lowbp; + i < highbp; + i++, bp++) { + if (bp->bp_free) + continue; + + done++; + + switch (cmd) { + case KDBCMD_BC: + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " cleared\n", + i, bp->bp_addr); + + bp->bp_addr = 0; + bp->bp_free = 1; + + break; + case KDBCMD_BE: + bp->bp_enabled = 1; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " enabled", + i, bp->bp_addr); + + kdb_printf("\n"); + break; + case KDBCMD_BD: + if (!bp->bp_enabled) + break; + + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " disabled\n", + i, bp->bp_addr); + + break; + } + if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) { + bp->bp_delay = 0; + KDB_STATE_CLEAR(SSBPT); + } + } + + return (!done) ? KDB_BPTNOTFOUND : 0; +} + +/* + * kdb_ss + * + * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) + * commands. + * + * ss + * ssb + * + * Parameters: + * argc Argument count + * argv Argument vector + * Outputs: + * None. + * Returns: + * KDB_CMD_SS[B] for success, a kdb error if failure. + * Locking: + * None. + * Remarks: + * + * Set the arch specific option to trigger a debug trap after the next + * instruction. + * + * For 'ssb', set the trace flag in the debug trap handler + * after printing the current insn and return directly without + * invoking the kdb command processor, until a branch instruction + * is encountered. + */ + +static int kdb_ss(int argc, const char **argv) +{ + int ssb = 0; + + ssb = (strcmp(argv[0], "ssb") == 0); + if (argc != 0) + return KDB_ARGCOUNT; + /* + * Set trace flag and go. 
+ */ + KDB_STATE_SET(DOING_SS); + if (ssb) { + KDB_STATE_SET(DOING_SSB); + return KDB_CMD_SSB; + } + return KDB_CMD_SS; +} + +/* Initialize the breakpoint table and register breakpoint commands. */ + +void __init kdb_initbptab(void) +{ + int i; + kdb_bp_t *bp; + + /* + * First time initialization. + */ + memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints)); + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) + bp->bp_free = 1; + + kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", + "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", + "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) + kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", + "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bc", kdb_bc, "<bpnum>", + "Clear Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("be", kdb_bc, "<bpnum>", + "Enable Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bd", kdb_bc, "<bpnum>", + "Disable Breakpoint", 0, KDB_REPEAT_NONE); + + kdb_register_repeat("ss", kdb_ss, "", + "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("ssb", kdb_ss, "", + "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); + /* + * Architecture dependent initialization. + */ +} diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c new file mode 100644 index 00000000000..2f62fe85f16 --- /dev/null +++ b/kernel/debug/kdb/kdb_bt.c @@ -0,0 +1,210 @@ +/* + * Kernel Debugger Architecture Independent Stack Traceback + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/kdb.h> +#include <linux/nmi.h> +#include <asm/system.h> +#include "kdb_private.h" + + +static void kdb_show_stack(struct task_struct *p, void *addr) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + kdb_trap_printk++; + kdb_set_current_task(p); + if (addr) { + show_stack((struct task_struct *)p, addr); + } else if (kdb_current_regs) { +#ifdef CONFIG_X86 + show_stack(p, &kdb_current_regs->sp); +#else + show_stack(p, NULL); +#endif + } else { + show_stack(p, NULL); + } + console_loglevel = old_lvl; + kdb_trap_printk--; +} + +/* + * kdb_bt + * + * This function implements the 'bt' command. Print a stack + * traceback. + * + * bt [<address-expression>] (addr-exp is for alternate stacks) + * btp <pid> Kernel stack for <pid> + * btt <address-expression> Kernel stack for task structure at + * <address-expression> + * bta [DRSTCZEUIMA] All useful processes, optionally + * filtered by state + * btc [<cpu>] The current process on one cpu, + * default is all cpus + * + * bt <address-expression> refers to a address on the stack, that location + * is assumed to contain a return address. + * + * btt <address-expression> refers to the address of a struct task. + * + * Inputs: + * argc argument count + * argv argument vector + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Backtrack works best when the code uses frame pointers. But even + * without frame pointers we should get a reasonable trace. 
+ * + * mds comes in handy when examining the stack to do a manual traceback or + * to get a starting point for bt <address-expression>. + */ + +static int +kdb_bt1(struct task_struct *p, unsigned long mask, + int argcount, int btaprompt) +{ + char buffer[2]; + if (kdb_getarea(buffer[0], (unsigned long)p) || + kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + return KDB_BADADDR; + if (!kdb_task_state(p, mask)) + return 0; + kdb_printf("Stack traceback for pid %d\n", p->pid); + kdb_ps1(p); + kdb_show_stack(p, NULL); + if (btaprompt) { + kdb_getstr(buffer, sizeof(buffer), + "Enter <q> to end, <cr> to continue:"); + if (buffer[0] == 'q') { + kdb_printf("\n"); + return 1; + } + } + touch_nmi_watchdog(); + return 0; +} + +int +kdb_bt(int argc, const char **argv) +{ + int diag; + int argcount = 5; + int btaprompt = 1; + int nextarg; + unsigned long addr; + long offset; + + kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ + kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each + * proc in bta */ + + if (strcmp(argv[0], "bta") == 0) { + struct task_struct *g, *p; + unsigned long cpu; + unsigned long mask = kdb_task_state_string(argc ? argv[1] : + NULL); + if (argc == 0) + kdb_ps_suppressed(); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } + /* Now the inactive tasks */ + kdb_do_each_thread(g, p) { + if (task_curr(p)) + continue; + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } kdb_while_each_thread(g, p); + } else if (strcmp(argv[0], "btp") == 0) { + struct task_struct *p; + unsigned long pid; + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &pid); + if (diag) + return diag; + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) { + kdb_set_current_task(p); + return kdb_bt1(p, ~0UL, argcount, 0); + } + kdb_printf("No process with pid == %ld found\n", pid); + return 0; + } else if (strcmp(argv[0], "btt") == 0) { + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &addr); + if (diag) + return diag; + kdb_set_current_task((struct task_struct *)addr); + return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + } else if (strcmp(argv[0], "btc") == 0) { + unsigned long cpu = ~0; + struct task_struct *save_current_task = kdb_current_task; + char buf[80]; + if (argc > 1) + return KDB_ARGCOUNT; + if (argc == 1) { + diag = kdbgetularg((char *)argv[1], &cpu); + if (diag) + return diag; + } + /* Recursive use of kdb_parse, do not use argv after + * this point */ + argv = NULL; + if (cpu != ~0) { + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("no process for cpu %ld\n", cpu); + return 0; + } + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + return 0; + } + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + touch_nmi_watchdog(); + } + kdb_set_current_task(save_current_task); + return 0; + } else { + if (argc) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if (diag) + return diag; + kdb_show_stack(kdb_current_task, (void *)addr); + return 0; + } else { + return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + } + } + + /* NOTREACHED */ + return 0; +} diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds new file mode 100644 index 00000000000..56c88e4db30 --- /dev/null +++ b/kernel/debug/kdb/kdb_cmds @@ -0,0 +1,35 @@ +# Initial commands 
for kdb, alter to suit your needs. +# These commands are executed in kdb_init() context, no SMP, no +# processes. Commands that require process data (including stack or +# registers) are not reliable this early. set and bp commands should +# be safe. Global breakpoint commands affect each cpu as it is booted. + +# Standard debugging information for first level support, just type archkdb +# or archkdbcpu or archkdbshort at the kdb prompt. + +defcmd dumpcommon "" "Common kdb debugging" + set BTAPROMPT 0 + set LINES 10000 + -summary + -cpu + -ps + -dmesg 600 + -bt +endefcmd + +defcmd dumpall "" "First line debugging" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -bta +endefcmd + +defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -btc +endefcmd + diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c new file mode 100644 index 00000000000..bf6e8270e95 --- /dev/null +++ b/kernel/debug/kdb/kdb_debugger.c @@ -0,0 +1,169 @@ +/* + * Created by: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/kdebug.h> +#include "kdb_private.h" +#include "../debug_core.h" + +/* + * KDB interface to KGDB internals + */ +get_char_func kdb_poll_funcs[] = { + dbg_io_get_char, + NULL, + NULL, + NULL, + NULL, + NULL, +}; +EXPORT_SYMBOL_GPL(kdb_poll_funcs); + +int kdb_poll_idx = 1; +EXPORT_SYMBOL_GPL(kdb_poll_idx); + +int kdb_stub(struct kgdb_state *ks) +{ + int error = 0; + kdb_bp_t *bp; + unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kdb_reason_t reason = KDB_REASON_OOPS; + kdb_dbtrap_t db_result = KDB_DB_NOBPT; + int i; + + if (KDB_STATE(REENTRY)) { + reason = KDB_REASON_SWITCH; + KDB_STATE_CLEAR(REENTRY); + addr = instruction_pointer(ks->linux_regs); + } + ks->pass_exception = 0; + if (atomic_read(&kgdb_setting_breakpoint)) + reason = KDB_REASON_KEYBOARD; + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if ((bp->bp_enabled) && (bp->bp_addr == addr)) { + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + if (addr != instruction_pointer(ks->linux_regs)) + kgdb_arch_set_pc(ks->linux_regs, addr); + break; + } + } + if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if (bp->bp_free) + continue; + if (bp->bp_addr == addr) { + bp->bp_delay = 1; + bp->bp_delayed = 1; + /* + * SSBPT is set when the kernel debugger must single step a + * task in order to re-establish an instruction breakpoint + * which uses the instruction replacement mechanism. It is + * cleared by any action that removes the need to single-step + * the breakpoint. 
+	 */
+				reason = KDB_REASON_BREAK;
+				db_result = KDB_DB_BPT;
+				KDB_STATE_SET(SSBPT);
+				break;
+			}
+		}
+	}
+
+	if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
+	    ks->signo == SIGTRAP) {
+		reason = KDB_REASON_SSTEP;
+		db_result = KDB_DB_BPT;
+	}
+	/* Set initial kdb state variables */
+	KDB_STATE_CLEAR(KGDB_TRANS);
+	kdb_initial_cpu = ks->cpu;
+	kdb_current_task = kgdb_info[ks->cpu].task;
+	kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+	/* Remove any breakpoints as needed by kdb and clear single step */
+	kdb_bp_remove();
+	KDB_STATE_CLEAR(DOING_SS);
+	KDB_STATE_CLEAR(DOING_SSB);
+	KDB_STATE_SET(PAGER);
+	/* zero out any offline cpu data */
+	for_each_present_cpu(i) {
+		if (!cpu_online(i)) {
+			kgdb_info[i].debuggerinfo = NULL;
+			kgdb_info[i].task = NULL;
+		}
+	}
+	if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
+		ks->pass_exception = 1;
+		KDB_FLAG_SET(CATASTROPHIC);
+	}
+	kdb_initial_cpu = ks->cpu;
+	if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
+		KDB_STATE_CLEAR(SSBPT);
+		KDB_STATE_CLEAR(DOING_SS);
+	} else {
+		/* Start kdb main loop */
+		error = kdb_main_loop(KDB_REASON_ENTER, reason,
+				      ks->err_code, db_result, ks->linux_regs);
+	}
+	/*
+	 * Upon exit from the kdb main loop setup break points and restart
+	 * the system based on the requested continue state
+	 */
+	kdb_initial_cpu = -1;
+	kdb_current_task = NULL;
+	kdb_current_regs = NULL;
+	KDB_STATE_CLEAR(PAGER);
+	kdbnearsym_cleanup();
+	if (error == KDB_CMD_KGDB) {
+		if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
+	/*
+	 * This is the interface glue that allows kdb to transition
+	 * into the gdb stub. In order to do this, the '?' or ''
+	 * gdb serial packet response is processed here, and then
+	 * control is passed to the gdbstub.
+	 */
+			if (KDB_STATE(DOING_KGDB))
+				gdbstub_state(ks, "?");
+			else
+				gdbstub_state(ks, "");
+			KDB_STATE_CLEAR(DOING_KGDB);
+			KDB_STATE_CLEAR(DOING_KGDB2);
+		}
+		return DBG_PASS_EVENT;
+	}
+	kdb_bp_install(ks->linux_regs);
+	dbg_activate_sw_breakpoints();
+	/* Set the exit state to a single step or a continue */
+	if (KDB_STATE(DOING_SS))
+		gdbstub_state(ks, "s");
+	else
+		gdbstub_state(ks, "c");
+
+	KDB_FLAG_CLEAR(CATASTROPHIC);
+
+	/* Invoke arch specific exception handling prior to system resume */
+	kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
+	if (ks->pass_exception)
+		kgdb_info[ks->cpu].ret_state = 1;
+	if (error == KDB_CMD_CPU) {
+		KDB_STATE_SET(REENTRY);
+		/*
+		 * Force clear the single step bit because kdb emulates this
+		 * differently vs the gdbstub
+		 */
+		kgdb_single_step = 0;
+		dbg_deactivate_sw_breakpoints();
+		return DBG_SWITCH_CPU_EVENT;
+	}
+	return kgdb_info[ks->cpu].ret_state;
+}
+
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 00000000000..c9b7f4f90bb
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
+/*
+ * Kernel Debugger Architecture Independent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kdev_t.h> +#include <linux/console.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <linux/delay.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/kallsyms.h> +#include "kdb_private.h" + +#define CMD_BUFLEN 256 +char kdb_prompt_str[CMD_BUFLEN]; + +int kdb_trap_printk; + +static void kgdb_transition_check(char *buffer) +{ + int slen = strlen(buffer); + if (strncmp(buffer, "$?#3f", slen) != 0 && + strncmp(buffer, "$qSupported#37", slen) != 0 && + strncmp(buffer, "+$qSupported#37", slen) != 0) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } +} + +static int kdb_read_get_key(char *buffer, size_t bufsize) +{ +#define ESCAPE_UDELAY 1000 +#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ + char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ + char *ped = escape_data; + int escape_delay = 0; + get_char_func *f, *f_escape = NULL; + int key; + + for (f = &kdb_poll_funcs[0]; ; ++f) { + if (*f == NULL) { + /* Reset NMI watchdog once per poll loop */ + touch_nmi_watchdog(); + f = &kdb_poll_funcs[0]; + } + if (escape_delay == 2) { + *ped = '\0'; + ped = escape_data; + --escape_delay; + } + if (escape_delay == 1) { + key = *ped++; + if (!*ped) + --escape_delay; + break; + } + key = (*f)(); + if (key == -1) { + if (escape_delay) { + udelay(ESCAPE_UDELAY); + --escape_delay; + } + continue; + } + if (bufsize <= 2) { + if (key == '\r') + key = '\n'; + *buffer++ = key; + *buffer = '\0'; + return -1; + } + if (escape_delay == 0 && key == '\e') { + escape_delay = ESCAPE_DELAY; + ped = escape_data; + f_escape = f; + } + if (escape_delay) { + *ped++ = key; + if (f_escape != f) { + escape_delay = 2; + continue; + } + if (ped - escape_data == 1) { + /* \e */ + continue; + } else if (ped - escape_data == 2) { + /* \e<something> */ + if (key != '[') + escape_delay = 2; + continue; + } else if (ped - escape_data == 3) { + /* \e[<something> */ + int mapkey = 0; + switch (key) { + case 'A': /* \e[A, up arrow */ + mapkey = 16; + break; + case 'B': /* \e[B, down arrow */ + mapkey = 14; + break; + case 'C': /* \e[C, right arrow */ + mapkey = 6; + break; + case 'D': /* \e[D, left arrow */ + mapkey = 2; + break; + case '1': /* dropthrough */ + case '3': /* dropthrough */ + /* \e[<1,3,4>], may be home, del, end */ + case '4': + mapkey = -1; + break; + } + if (mapkey != -1) { + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + } + continue; + } else if (ped - escape_data == 4) { + /* \e[<1,3,4><something> */ + int mapkey = 0; + if (key == '~') { + switch (escape_data[2]) { + case '1': /* \e[1~, home */ + mapkey = 1; + break; + case '3': /* \e[3~, del */ + mapkey = 4; + break; + case '4': /* \e[4~, end */ + mapkey = 5; + break; + } + } + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + continue; + } + } + break; /* A key to process */ + } + return key; +} + +/* + * kdb_read + * + * This function reads a string of characters, terminated by + * a newline, or by reaching the end of the supplied buffer, + * from the current kernel debugger console device. + * Parameters: + * buffer - Address of character buffer to receive input characters. 
+ * bufsize - size, in bytes, of the character buffer + * Returns: + * Returns a pointer to the buffer containing the received + * character string. This string will be terminated by a + * newline character. + * Locking: + * No locks are required to be held upon entry to this + * function. It is not reentrant - it relies on the fact + * that while kdb is running on only one "master debug" cpu. + * Remarks: + * + * The buffer size must be >= 2. A buffer size of 2 means that the caller only + * wants a single key. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources and by the need to sometimes + * return after just one key. Escape sequence processing has to be done as + * states in the polling loop. + */ + +static char *kdb_read(char *buffer, size_t bufsize) +{ + char *cp = buffer; + char *bufend = buffer+bufsize-2; /* Reserve space for newline + * and null byte */ + char *lastchar; + char *p_tmp; + char tmp; + static char tmpbuffer[CMD_BUFLEN]; + int len = strlen(buffer); + int len_tmp; + int tab = 0; + int count; + int i; + int diag, dtab_count; + int key; + + + diag = kdbgetintenv("DTABCOUNT", &dtab_count); + if (diag) + dtab_count = 30; + + if (len > 0) { + cp += len; + if (*(buffer+len-1) == '\n') + cp--; + } + + lastchar = cp; + *cp = '\0'; + kdb_printf("%s", buffer); +poll_again: + key = kdb_read_get_key(buffer, bufsize); + if (key == -1) + return buffer; + if (key != 9) + tab = 0; + switch (key) { + case 8: /* backspace */ + if (cp > buffer) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp-1, tmpbuffer, lastchar - cp); + } + *(--lastchar) = '\0'; + --cp; + kdb_printf("\b%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 13: /* enter */ + *lastchar++ = '\n'; + *lastchar++ = '\0'; + kdb_printf("\n"); + return buffer; + case 4: /* Del */ + if (cp < lastchar) { + memcpy(tmpbuffer, cp+1, lastchar - cp - 1); + memcpy(cp, tmpbuffer, lastchar - cp - 1); + *(--lastchar) = '\0'; + kdb_printf("%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 1: /* Home */ + if (cp > buffer) { + kdb_printf("\r"); + kdb_printf(kdb_prompt_str); + cp = buffer; + } + break; + case 5: /* End */ + if (cp < lastchar) { + kdb_printf("%s", cp); + cp = lastchar; + } + break; + case 2: /* Left */ + if (cp > buffer) { + kdb_printf("\b"); + --cp; + } + break; + case 14: /* Down */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 6: /* Right */ + if (cp < lastchar) { + kdb_printf("%c", *cp); + ++cp; + } + break; + case 16: /* Up */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 9: /* Tab */ + if (tab < 2) + ++tab; + p_tmp = buffer; + while (*p_tmp == ' ') + p_tmp++; + if (p_tmp > cp) + break; + memcpy(tmpbuffer, p_tmp, cp-p_tmp); + 
*(tmpbuffer + (cp-p_tmp)) = '\0'; + p_tmp = strrchr(tmpbuffer, ' '); + if (p_tmp) + ++p_tmp; + else + p_tmp = tmpbuffer; + len = strlen(p_tmp); + count = kallsyms_symbol_complete(p_tmp, + sizeof(tmpbuffer) - + (p_tmp - tmpbuffer)); + if (tab == 2 && count > 0) { + kdb_printf("\n%d symbols are found.", count); + if (count > dtab_count) { + count = dtab_count; + kdb_printf(" But only first %d symbols will" + " be printed.\nYou can change the" + " environment variable DTABCOUNT.", + count); + } + kdb_printf("\n"); + for (i = 0; i < count; i++) { + if (kallsyms_symbol_next(p_tmp, i) < 0) + break; + kdb_printf("%s ", p_tmp); + *(p_tmp + len) = '\0'; + } + if (i >= dtab_count) + kdb_printf("..."); + kdb_printf("\n"); + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + } else if (tab != 2 && count > 0) { + len_tmp = strlen(p_tmp); + strncpy(p_tmp+len_tmp, cp, lastchar-cp+1); + len_tmp = strlen(p_tmp); + strncpy(cp, p_tmp+len, len_tmp-len + 1); + len = len_tmp - len; + kdb_printf("%s", cp); + cp += len; + lastchar += len; + } + kdb_nextline = 1; /* reset output line number */ + break; + default: + if (key >= 32 && lastchar < bufend) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp+1, tmpbuffer, lastchar - cp); + *++lastchar = '\0'; + *cp = key; + kdb_printf("%s\r", cp); + ++cp; + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } else { + *++lastchar = '\0'; + *cp++ = key; + /* The kgdb transition check will hide + * printed characters if we think that + * kgdb is connecting, until the check + * fails */ + if (!KDB_STATE(KGDB_TRANS)) + kgdb_transition_check(buffer); + else + kdb_printf("%c", key); + } + /* Special escape to kgdb */ + if (lastchar - buffer >= 5 && + strcmp(lastchar - 5, "$?#3f") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return buffer; + } + if (lastchar - buffer >= 14 && + strcmp(lastchar - 14, "$qSupported#37") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB2); + return buffer; + } + } + break; + } + goto poll_again; +} + +/* + * kdb_getstr + * + * Print the prompt string and read a command from the + * input device. + * + * Parameters: + * buffer Address of buffer to receive command + * bufsize Size of buffer in bytes + * prompt Pointer to string to use as prompt string + * Returns: + * Pointer to command buffer. + * Locking: + * None. + * Remarks: + * For SMP kernels, the processor number will be + * substituted for %d, %x or %o in the prompt. + */ + +char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +{ + if (prompt && kdb_prompt_str != prompt) + strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + kdb_printf(kdb_prompt_str); + kdb_nextline = 1; /* Prompt and input resets line number */ + return kdb_read(buffer, bufsize); +} + +/* + * kdb_input_flush + * + * Get rid of any buffered console input. + * + * Parameters: + * none + * Returns: + * nothing + * Locking: + * none + * Remarks: + * Call this function whenever you want to flush input. If there is any + * outstanding input, it ignores all characters until there has been no + * data for approximately 1ms. + */ + +static void kdb_input_flush(void) +{ + get_char_func *f; + int res; + int flush_delay = 1; + while (flush_delay) { + flush_delay--; +empty: + touch_nmi_watchdog(); + for (f = &kdb_poll_funcs[0]; *f; ++f) { + res = (*f)(); + if (res != -1) { + flush_delay = 1; + goto empty; + } + } + if (flush_delay) + mdelay(1); + } +} + +/* + * kdb_printf + * + * Print a string to the output device(s). 
+ * + * Parameters: + * printf-like format and optional args. + * Returns: + * 0 + * Locking: + * None. + * Remarks: + * use 'kdbcons->write()' to avoid polluting 'log_buf' with + * kdb output. + * + * If the user is doing a cmd args | grep srch + * then kdb_grepping_flag is set. + * In that case we need to accumulate full lines (ending in \n) before + * searching for the pattern. + */ + +static char kdb_buffer[256]; /* A bit too big to go on stack */ +static char *next_avail = kdb_buffer; +static int size_avail; +static int suspend_grep; + +/* + * search arg1 to see if it contains arg2 + * (kdmain.c provides flags for ^pat and pat$) + * + * return 1 for found, 0 for not found + */ +static int kdb_search_string(char *searched, char *searchfor) +{ + char firstchar, *cp; + int len1, len2; + + /* not counting the newline at the end of "searched" */ + len1 = strlen(searched)-1; + len2 = strlen(searchfor); + if (len1 < len2) + return 0; + if (kdb_grep_leading && kdb_grep_trailing && len1 != len2) + return 0; + if (kdb_grep_leading) { + if (!strncmp(searched, searchfor, len2)) + return 1; + } else if (kdb_grep_trailing) { + if (!strncmp(searched+len1-len2, searchfor, len2)) + return 1; + } else { + firstchar = *searchfor; + cp = searched; + while ((cp = strchr(cp, firstchar))) { + if (!strncmp(cp, searchfor, len2)) + return 1; + cp++; + } + } + return 0; +} + +int vkdb_printf(const char *fmt, va_list ap) +{ + int diag; + int linecount; + int logging, saved_loglevel = 0; + int saved_trap_printk; + int got_printf_lock = 0; + int retlen = 0; + int fnd, len; + char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; + char *moreprompt = "more> "; + struct console *c = console_drivers; + static DEFINE_SPINLOCK(kdb_printf_lock); + unsigned long uninitialized_var(flags); + + preempt_disable(); + saved_trap_printk = kdb_trap_printk; + kdb_trap_printk = 0; + + /* Serialize kdb_printf if multiple cpus try to write at once. + * But if any cpu goes recursive in kdb, just print the output, + * even if it is interleaved with any other text. + */ + if (!KDB_STATE(PRINTF_LOCK)) { + KDB_STATE_SET(PRINTF_LOCK); + spin_lock_irqsave(&kdb_printf_lock, flags); + got_printf_lock = 1; + atomic_inc(&kdb_event); + } else { + __acquire(kdb_printf_lock); + } + + diag = kdbgetintenv("LINES", &linecount); + if (diag || linecount <= 1) + linecount = 24; + + diag = kdbgetintenv("LOGGING", &logging); + if (diag) + logging = 0; + + if (!kdb_grepping_flag || suspend_grep) { + /* normally, every vsnprintf starts a new buffer */ + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + } + vsnprintf(next_avail, size_avail, fmt, ap); + + /* + * If kdb_parse() found that the command was cmd xxx | grep yyy + * then kdb_grepping_flag is set, and kdb_grep_string contains yyy + * + * Accumulate the print data up to a newline before searching it. + * (vsnprintf does null-terminate the string that it generates) + */ + + /* skip the search if prints are temporarily unconditional */ + if (!suspend_grep && kdb_grepping_flag) { + cp = strchr(kdb_buffer, '\n'); + if (!cp) { + /* + * Special cases that don't end with newlines + * but should be written without one: + * The "[nn]kdb> " prompt should + * appear at the front of the buffer. + * + * The "[nn]more " prompt should also be + * (MOREPROMPT -> moreprompt) + * written * but we print that ourselves, + * we set the suspend_grep flag to make + * it unconditional. 
+ * + */ + if (next_avail == kdb_buffer) { + /* + * these should occur after a newline, + * so they will be at the front of the + * buffer + */ + cp2 = kdb_buffer; + len = strlen(kdb_prompt_str); + if (!strncmp(cp2, kdb_prompt_str, len)) { + /* + * We're about to start a new + * command, so we can go back + * to normal mode. + */ + kdb_grepping_flag = 0; + goto kdb_printit; + } + } + /* no newline; don't search/write the buffer + until one is there */ + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + + /* + * The newline is present; print through it or discard + * it, depending on the results of the search. + */ + cp++; /* to byte after the newline */ + replaced_byte = *cp; /* remember what/where it was */ + cphold = cp; + *cp = '\0'; /* end the string for our search */ + + /* + * We now have a newline at the end of the string + * Only continue with this output if it contains the + * search string. + */ + fnd = kdb_search_string(kdb_buffer, kdb_grep_string); + if (!fnd) { + /* + * At this point the complete line at the start + * of kdb_buffer can be discarded, as it does + * not contain what the user is looking for. + * Shift the buffer left. + */ + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + /* + * at this point the string is a full line and + * should be printed, up to the null. + */ + } +kdb_printit: + + /* + * Write to all consoles. + */ + retlen = strlen(kdb_buffer); + if (!dbg_kdb_mode && kgdb_connected) { + gdbstub_msg_write(kdb_buffer, retlen); + } else { + if (!dbg_io_ops->is_console) { + len = strlen(kdb_buffer); + cp = kdb_buffer; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + while (c) { + c->write(c, kdb_buffer, retlen); + touch_nmi_watchdog(); + c = c->next; + } + } + if (logging) { + saved_loglevel = console_loglevel; + console_loglevel = 0; + printk(KERN_INFO "%s", kdb_buffer); + } + + if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) + kdb_nextline++; + + /* check for having reached the LINES number of printed lines */ + if (kdb_nextline == linecount) { + char buf1[16] = ""; +#if defined(CONFIG_SMP) + char buf2[32]; +#endif + + /* Watch out for recursion here. Any routine that calls + * kdb_printf will come back through here. And kdb_read + * uses kdb_printf to echo on serial consoles ... + */ + kdb_nextline = 1; /* In case of recursion */ + + /* + * Pause until cr. + */ + moreprompt = kdbgetenv("MOREPROMPT"); + if (moreprompt == NULL) + moreprompt = "more> "; + +#if defined(CONFIG_SMP) + if (strchr(moreprompt, '%')) { + sprintf(buf2, moreprompt, get_cpu()); + put_cpu(); + moreprompt = buf2; + } +#endif + + kdb_input_flush(); + c = console_drivers; + + if (!dbg_io_ops->is_console) { + len = strlen(moreprompt); + cp = moreprompt; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + while (c) { + c->write(c, moreprompt, strlen(moreprompt)); + touch_nmi_watchdog(); + c = c->next; + } + + if (logging) + printk("%s", moreprompt); + + kdb_read(buf1, 2); /* '2' indicates to return + * immediately after getting one key. 
*/ + kdb_nextline = 1; /* Really set output line 1 */ + + /* empty and reset the buffer: */ + kdb_buffer[0] = '\0'; + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + /* user hit q or Q */ + KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ + KDB_STATE_CLEAR(PAGER); + /* end of command output; back to normal mode */ + kdb_grepping_flag = 0; + kdb_printf("\n"); + } else if (buf1[0] == ' ') { + kdb_printf("\n"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '\n') { + kdb_nextline = linecount - 1; + kdb_printf("\r"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] && buf1[0] != '\n') { + /* user hit something other than enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\nOnly 'q' or 'Q' are processed at more " + "prompt, input ignored\n"); + } else if (kdb_grepping_flag) { + /* user hit enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\n"); + } + kdb_input_flush(); + } + + /* + * For grep searches, shift the printed string left. + * replaced_byte contains the character that was overwritten with + * the terminating null, and cphold points to the null. + * Then adjust the notion of available space in the buffer. + */ + if (kdb_grepping_flag && !suspend_grep) { + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + } + +kdb_print_out: + suspend_grep = 0; /* end of what may have been a recursive call */ + if (logging) + console_loglevel = saved_loglevel; + if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { + got_printf_lock = 0; + spin_unlock_irqrestore(&kdb_printf_lock, flags); + KDB_STATE_CLEAR(PRINTF_LOCK); + atomic_dec(&kdb_event); + } else { + __release(kdb_printf_lock); + } + kdb_trap_printk = saved_trap_printk; + preempt_enable(); + return retlen; +} + +int kdb_printf(const char *fmt, ...) +{ + va_list ap; + int r; + + va_start(ap, fmt); + r = vkdb_printf(fmt, ap); + va_end(ap); + + return r; +} + diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c new file mode 100644 index 00000000000..4bca634975c --- /dev/null +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -0,0 +1,212 @@ +/* + * Kernel Debugger Architecture Dependent Console I/O handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. + * + * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/kdb.h> +#include <linux/keyboard.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/io.h> + +/* Keyboard Controller Registers on normal PCs. */ + +#define KBD_STATUS_REG 0x64 /* Status register (R) */ +#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */ + +/* Status Register Bits */ + +#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */ +#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ + +static int kbd_exists; + +/* + * Check if the keyboard controller has a keypress for us. + * Some parts (Enter Release, LED change) are still blocking polled here, + * but hopefully they are all short. 
+ */ +int kdb_get_kbd_char(void) +{ + int scancode, scanstatus; + static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */ + static int shift_key; /* Shift next keypress */ + static int ctrl_key; + u_short keychar; + + if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) || + (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) { + kbd_exists = 0; + return -1; + } + kbd_exists = 1; + + if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + return -1; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + /* + * Ignore mouse events. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + return -1; + + /* + * Ignore release, trigger on make + * (except for shift keys, where we want to + * keep the shift state so long as the key is + * held down). + */ + + if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) { + /* + * Next key may use shift table + */ + if ((scancode & 0x80) == 0) + shift_key = 1; + else + shift_key = 0; + return -1; + } + + if ((scancode&0x7f) == 0x1d) { + /* + * Left ctrl key + */ + if ((scancode & 0x80) == 0) + ctrl_key = 1; + else + ctrl_key = 0; + return -1; + } + + if ((scancode & 0x80) != 0) + return -1; + + scancode &= 0x7f; + + /* + * Translate scancode + */ + + if (scancode == 0x3a) { + /* + * Toggle caps lock + */ + shift_lock ^= 1; + +#ifdef KDB_BLINK_LED + kdb_toggleled(0x4); +#endif + return -1; + } + + if (scancode == 0x0e) { + /* + * Backspace + */ + return 8; + } + + /* Special Key */ + switch (scancode) { + case 0xF: /* Tab */ + return 9; + case 0x53: /* Del */ + return 4; + case 0x47: /* Home */ + return 1; + case 0x4F: /* End */ + return 5; + case 0x4B: /* Left */ + return 2; + case 0x48: /* Up */ + return 16; + case 0x50: /* Down */ + return 14; + case 0x4D: /* Right */ + return 6; + } + + if (scancode == 0xe0) + return -1; + + /* + * For Japanese 86/106 keyboards + * See comment in drivers/char/pc_keyb.c. + * - Masahiro Adegawa + */ + if (scancode == 0x73) + scancode = 0x59; + else if (scancode == 0x7d) + scancode = 0x7c; + + if (!shift_lock && !shift_key && !ctrl_key) { + keychar = plain_map[scancode]; + } else if ((shift_lock || shift_key) && key_maps[1]) { + keychar = key_maps[1][scancode]; + } else if (ctrl_key && key_maps[4]) { + keychar = key_maps[4][scancode]; + } else { + keychar = 0x0020; + kdb_printf("Unknown state/scancode (%d)\n", scancode); + } + keychar &= 0x0fff; + if (keychar == '\t') + keychar = ' '; + switch (KTYP(keychar)) { + case KT_LETTER: + case KT_LATIN: + if (isprint(keychar)) + break; /* printable characters */ + /* drop through */ + case KT_SPEC: + if (keychar == K_ENTER) + break; + /* drop through */ + default: + return -1; /* ignore unprintables */ + } + + if ((scancode & 0x7f) == 0x1c) { + /* + * enter key. All done. Absorb the release scancode. + */ + while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + ; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + while (scanstatus & KBD_STAT_MOUSE_OBF) { + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + } + + if (scancode != 0x9c) { + /* + * Wasn't an enter-release, why not? 
+ */ + kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", + scancode, scanstatus); + } + + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c new file mode 100644 index 00000000000..caf057a3de0 --- /dev/null +++ b/kernel/debug/kdb/kdb_main.c @@ -0,0 +1,2956 @@ +/* + * Kernel Debugger Architecture Independent Main Code + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com> + * Xscale (R) modifications copyright (C) 2003 Intel Corporation. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/smp.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/kallsyms.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/notifier.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/nmi.h> +#include <linux/time.h> +#include <linux/ptrace.h> +#include <linux/sysctl.h> +#include <linux/cpu.h> +#include <linux/kdebug.h> +#include <linux/proc_fs.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include "kdb_private.h" + +#define GREP_LEN 256 +char kdb_grep_string[GREP_LEN]; +int kdb_grepping_flag; +EXPORT_SYMBOL(kdb_grepping_flag); +int kdb_grep_leading; +int kdb_grep_trailing; + +/* + * Kernel debugger state flags + */ +int kdb_flags; +atomic_t kdb_event; + +/* + * kdb_lock protects updates to kdb_initial_cpu. Used to + * single thread processors through the kernel debugger. + */ +int kdb_initial_cpu = -1; /* cpu number that owns kdb */ +int kdb_nextline = 1; +int kdb_state; /* General KDB state */ + +struct task_struct *kdb_current_task; +EXPORT_SYMBOL(kdb_current_task); +struct pt_regs *kdb_current_regs; + +const char *kdb_diemsg; +static int kdb_go_count; +#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC +static unsigned int kdb_continue_catastrophic = + CONFIG_KDB_CONTINUE_CATASTROPHIC; +#else +static unsigned int kdb_continue_catastrophic; +#endif + +/* kdb_commands describes the available commands. */ +static kdbtab_t *kdb_commands; +#define KDB_BASE_CMD_MAX 50 +static int kdb_max_commands = KDB_BASE_CMD_MAX; +static kdbtab_t kdb_base_commands[50]; +#define for_each_kdbcmd(cmd, num) \ + for ((cmd) = kdb_base_commands, (num) = 0; \ + num < kdb_max_commands; \ + num == KDB_BASE_CMD_MAX ? 
cmd = kdb_commands : cmd++, num++) + +typedef struct _kdbmsg { + int km_diag; /* kdb diagnostic */ + char *km_msg; /* Corresponding message text */ +} kdbmsg_t; + +#define KDBMSG(msgnum, text) \ + { KDB_##msgnum, text } + +static kdbmsg_t kdbmsgs[] = { + KDBMSG(NOTFOUND, "Command Not Found"), + KDBMSG(ARGCOUNT, "Improper argument count, see usage."), + KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, " + "8 is only allowed on 64 bit systems"), + KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"), + KDBMSG(NOTENV, "Cannot find environment variable"), + KDBMSG(NOENVVALUE, "Environment variable should have value"), + KDBMSG(NOTIMP, "Command not implemented"), + KDBMSG(ENVFULL, "Environment full"), + KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), +#ifdef CONFIG_CPU_XSCALE + KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), +#else + KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"), +#endif + KDBMSG(DUPBPT, "Duplicate breakpoint address"), + KDBMSG(BPTNOTFOUND, "Breakpoint not found"), + KDBMSG(BADMODE, "Invalid IDMODE"), + KDBMSG(BADINT, "Illegal numeric value"), + KDBMSG(INVADDRFMT, "Invalid symbolic address format"), + KDBMSG(BADREG, "Invalid register name"), + KDBMSG(BADCPUNUM, "Invalid cpu number"), + KDBMSG(BADLENGTH, "Invalid length field"), + KDBMSG(NOBP, "No Breakpoint exists"), + KDBMSG(BADADDR, "Invalid address"), +}; +#undef KDBMSG + +static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); + + +/* + * Initial environment. This is all kept static and local to + * this file. We don't want to rely on the memory allocation + * mechanisms in the kernel, so we use a very limited allocate-only + * heap for new and altered environment variables. The entire + * environment is limited to a fixed number of entries (add more + * to __env[] if required) and a fixed amount of heap (add more to + * KDB_ENVBUFSIZE if required). + */ + +static char *__env[] = { +#if defined(CONFIG_SMP) + "PROMPT=[%d]kdb> ", + "MOREPROMPT=[%d]more> ", +#else + "PROMPT=kdb> ", + "MOREPROMPT=more> ", +#endif + "RADIX=16", + "MDCOUNT=8", /* lines of md output */ + "BTARGS=9", /* 9 possible args in bt */ + KDB_PLATFORM_ENV, + "DTABCOUNT=30", + "NOSECT=1", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, +}; + +static const int __nenv = (sizeof(__env) / sizeof(char *)); + +struct task_struct *kdb_curr_task(int cpu) +{ + struct task_struct *p = curr_task(cpu); +#ifdef _TIF_MCA_INIT + if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu)) + p = krp->p; +#endif + return p; +} + +/* + * kdbgetenv - This function will return the character string value of + * an environment variable. + * Parameters: + * match A character string representing an environment variable. + * Returns: + * NULL No environment variable matches 'match' + * char* Pointer to string value of environment variable. + */ +char *kdbgetenv(const char *match) +{ + char **ep = __env; + int matchlen = strlen(match); + int i; + + for (i = 0; i < __nenv; i++) { + char *e = *ep++; + + if (!e) + continue; + + if ((strncmp(match, e, matchlen) == 0) + && ((e[matchlen] == '\0') + || (e[matchlen] == '='))) { + char *cp = strchr(e, '='); + return cp ? 
++cp : "";
+		}
+	}
+	return NULL;
+}
+
+/*
+ * kdballocenv - This function is used to allocate bytes for
+ *	environment entries.
+ * Parameters:
+ *	bytes	The number of bytes to allocate for the new entry.
+ * Returns:
+ *	A pointer to the allocated space, or NULL if the environment
+ *	buffer is exhausted.
+ * Remarks:
+ *	We use a static environment buffer (envbuffer) to hold the values
+ *	of dynamically generated environment variables (see kdb_set).  Buffer
+ *	space once allocated is never freed, so over time, the amount of space
+ *	(currently 512 bytes) will be exhausted if env variables are changed
+ *	frequently.
+ */
+static char *kdballocenv(size_t bytes)
+{
+#define	KDB_ENVBUFSIZE	512
+	static char envbuffer[KDB_ENVBUFSIZE];
+	static int envbufsize;
+	char *ep = NULL;
+
+	if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
+		ep = &envbuffer[envbufsize];
+		envbufsize += bytes;
+	}
+	return ep;
+}
+
+/*
+ * kdbgetulenv - This function will return the value of an unsigned
+ *	long-valued environment variable.
+ * Parameters:
+ *	match	A character string representing a numeric value
+ * Outputs:
+ *	*value  the unsigned long representation of the env variable 'match'
+ * Returns:
+ *	Zero on success, a kdb diagnostic on failure.
+ */
+static int kdbgetulenv(const char *match, unsigned long *value)
+{
+	char *ep;
+
+	ep = kdbgetenv(match);
+	if (!ep)
+		return KDB_NOTENV;
+	if (strlen(ep) == 0)
+		return KDB_NOENVVALUE;
+
+	*value = simple_strtoul(ep, NULL, 0);
+
+	return 0;
+}
+
+/*
+ * kdbgetintenv - This function will return the value of an
+ *	integer-valued environment variable.
+ * Parameters:
+ *	match	A character string representing an integer-valued env variable
+ * Outputs:
+ *	*value  the integer representation of the environment variable 'match'
+ * Returns:
+ *	Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetintenv(const char *match, int *value)
+{
+	unsigned long val;
+	int diag;
+
+	diag = kdbgetulenv(match, &val);
+	if (!diag)
+		*value = (int) val;
+	return diag;
+}
+
+/*
+ * kdbgetularg - This function will convert a numeric string into an
+ *	unsigned long value.
+ * Parameters:
+ *	arg	A character string representing a numeric value
+ * Outputs:
+ *	*value  the unsigned long representation of arg.
+ * Returns:
+ *	Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetularg(const char *arg, unsigned long *value)
+{
+	char *endp;
+	unsigned long val;
+
+	val = simple_strtoul(arg, &endp, 0);
+
+	if (endp == arg) {
+		/*
+		 * Also try base 16, for us folks too lazy to type the
+		 * leading 0x...
+		 */
+		val = simple_strtoul(arg, &endp, 16);
+		if (endp == arg)
+			return KDB_BADINT;
+	}
+
+	*value = val;
+
+	return 0;
+}
+
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+	char *endp;
+	u64 val;
+
+	val = simple_strtoull(arg, &endp, 0);
+
+	if (endp == arg) {
+
+		val = simple_strtoull(arg, &endp, 16);
+		if (endp == arg)
+			return KDB_BADINT;
+	}
+
+	*value = val;
+
+	return 0;
+}
+
+/*
+ * kdb_set - This function implements the 'set' command.  Alter an
+ *	existing environment variable or create a new one.
+ */
+int kdb_set(int argc, const char **argv)
+{
+	int i;
+	char *ep;
+	size_t varlen, vallen;
+
+	/*
+	 * we can be invoked two ways:
+	 *   set var=value  argv[1]="var", argv[2]="value"
+	 *   set var = value  argv[1]="var", argv[2]="=", argv[3]="value"
+	 * - if the latter, shift 'em down.
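+	 *   e.g. "set RADIX=10" and "set RADIX = 10" are equivalent; the
+	 *   latter is shifted below so both end up with argv[1]="RADIX"
+	 *   and argv[2]="10".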
+ */
+	if (argc == 3) {
+		argv[2] = argv[3];
+		argc--;
+	}
+
+	if (argc != 2)
+		return KDB_ARGCOUNT;
+
+	/*
+	 * Check for internal variables
+	 */
+	if (strcmp(argv[1], "KDBDEBUG") == 0) {
+		unsigned int debugflags;
+		char *cp;
+
+		debugflags = simple_strtoul(argv[2], &cp, 0);
+		if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+			kdb_printf("kdb: illegal debug flags '%s'\n",
+				    argv[2]);
+			return 0;
+		}
+		kdb_flags = (kdb_flags &
+			     ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+			| (debugflags << KDB_DEBUG_FLAG_SHIFT);
+
+		return 0;
+	}
+
+	/*
+	 * Tokenizer squashed the '=' sign.  argv[1] is variable
+	 * name, argv[2] = value.
+	 */
+	varlen = strlen(argv[1]);
+	vallen = strlen(argv[2]);
+	ep = kdballocenv(varlen + vallen + 2);
+	if (ep == (char *)0)
+		return KDB_ENVBUFFULL;
+
+	sprintf(ep, "%s=%s", argv[1], argv[2]);
+
+	ep[varlen+vallen+1] = '\0';
+
+	for (i = 0; i < __nenv; i++) {
+		if (__env[i]
+		 && ((strncmp(__env[i], argv[1], varlen) == 0)
+		   && ((__env[i][varlen] == '\0')
+		    || (__env[i][varlen] == '=')))) {
+			__env[i] = ep;
+			return 0;
+		}
+	}
+
+	/*
+	 * Wasn't existing variable.  Fit into slot.
+	 */
+	for (i = 0; i < __nenv-1; i++) {
+		if (__env[i] == (char *)0) {
+			__env[i] = ep;
+			return 0;
+		}
+	}
+
+	return KDB_ENVFULL;
+}
+
+static int kdb_check_regs(void)
+{
+	if (!kdb_current_regs) {
+		kdb_printf("No current kdb registers."
+			   " You may need to select another task\n");
+		return KDB_BADREG;
+	}
+	return 0;
+}
+
+/*
+ * kdbgetaddrarg - This function is responsible for parsing an
+ *	address-expression and returning the value of the expression,
+ *	symbol name, and offset to the caller.
+ *
+ *	The argument may consist of a numeric value (decimal or
+ *	hexadecimal), a symbol name, a register name (preceded by the
+ *	percent sign), an environment variable with a numeric value
+ *	(preceded by a dollar sign) or a simple arithmetic expression
+ *	consisting of a symbol name, +/-, and a numeric constant value
+ *	(offset).
+ * Parameters:
+ *	argc	- count of arguments in argv
+ *	argv	- argument vector
+ *	*nextarg - index to next unparsed argument in argv[]
+ * Outputs:
+ *	*value	- receives the value of the address-expression
+ *	*offset - receives the offset specified, if any
+ *	*name   - receives the symbol name, if any
+ *	*nextarg - index to next unparsed argument in argv[]
+ * Returns:
+ *	zero is returned on success, a kdb diagnostic code is
+ *	returned on error.
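+ * Example (illustrative):
+ *	"schedule+0x10" resolves the symbol 'schedule' via kdbgetsymval()
+ *	and adds the 0x10 offset, while "$BTARGS" reads the numeric value
+ *	of the BTARGS environment variable.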
+ */ +int kdbgetaddrarg(int argc, const char **argv, int *nextarg, + unsigned long *value, long *offset, + char **name) +{ + unsigned long addr; + unsigned long off = 0; + int positive; + int diag; + int found = 0; + char *symname; + char symbol = '\0'; + char *cp; + kdb_symtab_t symtab; + + /* + * Process arguments which follow the following syntax: + * + * symbol | numeric-address [+/- numeric-offset] + * %register + * $environment-variable + */ + + if (*nextarg > argc) + return KDB_ARGCOUNT; + + symname = (char *)argv[*nextarg]; + + /* + * If there is no whitespace between the symbol + * or address and the '+' or '-' symbols, we + * remember the character and replace it with a + * null so the symbol/value can be properly parsed + */ + cp = strpbrk(symname, "+-"); + if (cp != NULL) { + symbol = *cp; + *cp++ = '\0'; + } + + if (symname[0] == '$') { + diag = kdbgetulenv(&symname[1], &addr); + if (diag) + return diag; + } else if (symname[0] == '%') { + diag = kdb_check_regs(); + if (diag) + return diag; + /* Implement register values with % at a later time as it is + * arch optional. + */ + return KDB_NOTIMP; + } else { + found = kdbgetsymval(symname, &symtab); + if (found) { + addr = symtab.sym_start; + } else { + diag = kdbgetularg(argv[*nextarg], &addr); + if (diag) + return diag; + } + } + + if (!found) + found = kdbnearsym(addr, &symtab); + + (*nextarg)++; + + if (name) + *name = symname; + if (value) + *value = addr; + if (offset && name && *name) + *offset = addr - symtab.sym_start; + + if ((*nextarg > argc) + && (symbol == '\0')) + return 0; + + /* + * check for +/- and offset + */ + + if (symbol == '\0') { + if ((argv[*nextarg][0] != '+') + && (argv[*nextarg][0] != '-')) { + /* + * Not our argument. Return. + */ + return 0; + } else { + positive = (argv[*nextarg][0] == '+'); + (*nextarg)++; + } + } else + positive = (symbol == '+'); + + /* + * Now there must be an offset! + */ + if ((*nextarg > argc) + && (symbol == '\0')) { + return KDB_INVADDRFMT; + } + + if (!symbol) { + cp = (char *)argv[*nextarg]; + (*nextarg)++; + } + + diag = kdbgetularg(cp, &off); + if (diag) + return diag; + + if (!positive) + off = -off; + + if (offset) + *offset += off; + + if (value) + *value += off; + + return 0; +} + +static void kdb_cmderror(int diag) +{ + int i; + + if (diag >= 0) { + kdb_printf("no error detected (diagnostic is %d)\n", diag); + return; + } + + for (i = 0; i < __nkdb_err; i++) { + if (kdbmsgs[i].km_diag == diag) { + kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg); + return; + } + } + + kdb_printf("Unknown diag %d\n", -diag); +} + +/* + * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd' + * command which defines one command as a set of other commands, + * terminated by endefcmd. kdb_defcmd processes the initial + * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for + * the following commands until 'endefcmd'. 
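+ * Usage sketch (the name 'diag' is only an example):
+ *	defcmd diag "" "collect common diagnostics"
+ *	  ps
+ *	  summary
+ *	endefcmd
+ *	Typing 'diag' afterwards replays the saved 'ps' and 'summary'
+ *	commands via kdb_exec_defcmd().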
+ * Inputs: + * argc argument count + * argv argument vector + * Returns: + * zero for success, a kdb diagnostic if error + */ +struct defcmd_set { + int count; + int usable; + char *name; + char *usage; + char *help; + char **command; +}; +static struct defcmd_set *defcmd_set; +static int defcmd_set_count; +static int defcmd_in_progress; + +/* Forward references */ +static int kdb_exec_defcmd(int argc, const char **argv); + +static int kdb_defcmd2(const char *cmdstr, const char *argv0) +{ + struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; + char **save_command = s->command; + if (strcmp(argv0, "endefcmd") == 0) { + defcmd_in_progress = 0; + if (!s->count) + s->usable = 0; + if (s->usable) + kdb_register(s->name, kdb_exec_defcmd, + s->usage, s->help, 0); + return 0; + } + if (!s->usable) + return KDB_NOTIMP; + s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + if (!s->command) { + kdb_printf("Could not allocate new kdb_defcmd table for %s\n", + cmdstr); + s->usable = 0; + return KDB_NOTIMP; + } + memcpy(s->command, save_command, s->count * sizeof(*(s->command))); + s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); + kfree(save_command); + return 0; +} + +static int kdb_defcmd(int argc, const char **argv) +{ + struct defcmd_set *save_defcmd_set = defcmd_set, *s; + if (defcmd_in_progress) { + kdb_printf("kdb: nested defcmd detected, assuming missing " + "endefcmd\n"); + kdb_defcmd2("endefcmd", "endefcmd"); + } + if (argc == 0) { + int i; + for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { + kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, + s->usage, s->help); + for (i = 0; i < s->count; ++i) + kdb_printf("%s", s->command[i]); + kdb_printf("endefcmd\n"); + } + return 0; + } + if (argc != 3) + return KDB_ARGCOUNT; + defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), + GFP_KDB); + if (!defcmd_set) { + kdb_printf("Could not allocate new defcmd_set entry for %s\n", + argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; + } + memcpy(defcmd_set, save_defcmd_set, + defcmd_set_count * sizeof(*defcmd_set)); + kfree(save_defcmd_set); + s = defcmd_set + defcmd_set_count; + memset(s, 0, sizeof(*s)); + s->usable = 1; + s->name = kdb_strdup(argv[1], GFP_KDB); + s->usage = kdb_strdup(argv[2], GFP_KDB); + s->help = kdb_strdup(argv[3], GFP_KDB); + if (s->usage[0] == '"') { + strcpy(s->usage, s->usage+1); + s->usage[strlen(s->usage)-1] = '\0'; + } + if (s->help[0] == '"') { + strcpy(s->help, s->help+1); + s->help[strlen(s->help)-1] = '\0'; + } + ++defcmd_set_count; + defcmd_in_progress = 1; + return 0; +} + +/* + * kdb_exec_defcmd - Execute the set of commands associated with this + * defcmd name. 
+ * Inputs:
+ *	argc	argument count
+ *	argv	argument vector
+ * Returns:
+ *	zero for success, a kdb diagnostic if error
+ */
+static int kdb_exec_defcmd(int argc, const char **argv)
+{
+	int i, ret;
+	struct defcmd_set *s;
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+	for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
+		if (strcmp(s->name, argv[0]) == 0)
+			break;
+	}
+	if (i == defcmd_set_count) {
+		kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
+			   argv[0]);
+		return KDB_NOTIMP;
+	}
+	for (i = 0; i < s->count; ++i) {
+		/* Recursive use of kdb_parse, do not use argv after
+		 * this point */
+		argv = NULL;
+		kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
+		ret = kdb_parse(s->command[i]);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/* Command history */
+#define KDB_CMD_HISTORY_COUNT	32
+#define CMD_BUFLEN		200	/* kdb_printf: max printline
+					 * size == 256 */
+static unsigned int cmd_head, cmd_tail;
+static unsigned int cmdptr;
+static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
+static char cmd_cur[CMD_BUFLEN];
+
+/*
+ * The "str" argument may point to something like  | grep xyz
+ */
+static void parse_grep(const char *str)
+{
+	int len;
+	char *cp = (char *)str, *cp2;
+
+	/* sanity check: we should have been called with the '|' first */
+	if (*cp != '|')
+		return;
+	cp++;
+	while (isspace(*cp))
+		cp++;
+	if (strncmp(cp, "grep ", 5)) {
+		kdb_printf("invalid 'pipe', see grephelp\n");
+		return;
+	}
+	cp += 5;
+	while (isspace(*cp))
+		cp++;
+	cp2 = strchr(cp, '\n');
+	if (cp2)
+		*cp2 = '\0'; /* remove the trailing newline */
+	len = strlen(cp);
+	if (len == 0) {
+		kdb_printf("invalid 'pipe', see grephelp\n");
+		return;
+	}
+	/* now cp points to a nonzero length search string */
+	if (*cp == '"') {
+		/* allow it to be "x y z" by removing the "'s - there must
+		   be two of them */
+		cp++;
+		cp2 = strchr(cp, '"');
+		if (!cp2) {
+			kdb_printf("invalid quoted string, see grephelp\n");
+			return;
+		}
+		*cp2 = '\0'; /* end the string where the 2nd " was */
+	}
+	kdb_grep_leading = 0;
+	if (*cp == '^') {
+		kdb_grep_leading = 1;
+		cp++;
+	}
+	len = strlen(cp);
+	kdb_grep_trailing = 0;
+	if (*(cp+len-1) == '$') {
+		kdb_grep_trailing = 1;
+		*(cp+len-1) = '\0';
+	}
+	len = strlen(cp);
+	if (!len)
+		return;
+	if (len >= GREP_LEN) {
+		kdb_printf("search string too long\n");
+		return;
+	}
+	strcpy(kdb_grep_string, cp);
+	kdb_grepping_flag++;
+	return;
+}
+
+/*
+ * kdb_parse - Parse the command line, search the command table for a
+ *	matching command and invoke the command function.  This
+ *	function may be called recursively, if it is, the second call
+ *	will overwrite argv and cbuf.  It is the caller's
+ *	responsibility to save their argv if they recursively call
+ *	kdb_parse().
+ * Parameters:
+ *	cmdstr	The input command line to be parsed.
+ * Returns:
+ *	Zero for success, a kdb diagnostic if failure.
+ * Remarks:
+ *	Limited to 20 tokens.
+ *
+ *	Real rudimentary tokenization. Basically only whitespace
+ *	is considered a token delimiter (but special consideration
+ *	is taken of the '=' sign as used by the 'set' command).
+ *
+ *	The algorithm used to tokenize the input string relies on
+ *	there being at least one whitespace (or otherwise useless)
+ *	character between tokens as the character immediately following
+ *	the token is altered in-place to a null-byte to terminate the
+ *	token string.
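+ *
+ *	e.g. "md4c8 jiffies" tokenizes to { "md4c8", "jiffies" }, and
+ *	"set RADIX=10" to { "set", "RADIX", "10" }, since an unquoted
+ *	'=' also terminates a token.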
+ */ + +#define MAXARGC 20 + +int kdb_parse(const char *cmdstr) +{ + static char *argv[MAXARGC]; + static int argc; + static char cbuf[CMD_BUFLEN+2]; + char *cp; + char *cpp, quoted; + kdbtab_t *tp; + int i, escaped, ignore_errors = 0, check_grep; + + /* + * First tokenize the command string. + */ + cp = (char *)cmdstr; + kdb_grepping_flag = check_grep = 0; + + if (KDB_FLAG(CMD_INTERRUPT)) { + /* Previous command was interrupted, newline must not + * repeat the command */ + KDB_FLAG_CLEAR(CMD_INTERRUPT); + KDB_STATE_SET(PAGER); + argc = 0; /* no repeat */ + } + + if (*cp != '\n' && *cp != '\0') { + argc = 0; + cpp = cbuf; + while (*cp) { + /* skip whitespace */ + while (isspace(*cp)) + cp++; + if ((*cp == '\0') || (*cp == '\n') || + (*cp == '#' && !defcmd_in_progress)) + break; + /* special case: check for | grep pattern */ + if (*cp == '|') { + check_grep++; + break; + } + if (cpp >= cbuf + CMD_BUFLEN) { + kdb_printf("kdb_parse: command buffer " + "overflow, command ignored\n%s\n", + cmdstr); + return KDB_NOTFOUND; + } + if (argc >= MAXARGC - 1) { + kdb_printf("kdb_parse: too many arguments, " + "command ignored\n%s\n", cmdstr); + return KDB_NOTFOUND; + } + argv[argc++] = cpp; + escaped = 0; + quoted = '\0'; + /* Copy to next unquoted and unescaped + * whitespace or '=' */ + while (*cp && *cp != '\n' && + (escaped || quoted || !isspace(*cp))) { + if (cpp >= cbuf + CMD_BUFLEN) + break; + if (escaped) { + escaped = 0; + *cpp++ = *cp++; + continue; + } + if (*cp == '\\') { + escaped = 1; + ++cp; + continue; + } + if (*cp == quoted) + quoted = '\0'; + else if (*cp == '\'' || *cp == '"') + quoted = *cp; + *cpp = *cp++; + if (*cpp == '=' && !quoted) + break; + ++cpp; + } + *cpp++ = '\0'; /* Squash a ws or '=' character */ + } + } + if (!argc) + return 0; + if (check_grep) + parse_grep(cp); + if (defcmd_in_progress) { + int result = kdb_defcmd2(cmdstr, argv[0]); + if (!defcmd_in_progress) { + argc = 0; /* avoid repeat on endefcmd */ + *(argv[0]) = '\0'; + } + return result; + } + if (argv[0][0] == '-' && argv[0][1] && + (argv[0][1] < '0' || argv[0][1] > '9')) { + ignore_errors = 1; + ++argv[0]; + } + + for_each_kdbcmd(tp, i) { + if (tp->cmd_name) { + /* + * If this command is allowed to be abbreviated, + * check to see if this is it. + */ + + if (tp->cmd_minlen + && (strlen(argv[0]) <= tp->cmd_minlen)) { + if (strncmp(argv[0], + tp->cmd_name, + tp->cmd_minlen) == 0) { + break; + } + } + + if (strcmp(argv[0], tp->cmd_name) == 0) + break; + } + } + + /* + * If we don't find a command by this name, see if the first + * few characters of this match any of the known commands. + * e.g., md1c20 should match md. + */ + if (i == kdb_max_commands) { + for_each_kdbcmd(tp, i) { + if (tp->cmd_name) { + if (strncmp(argv[0], + tp->cmd_name, + strlen(tp->cmd_name)) == 0) { + break; + } + } + } + } + + if (i < kdb_max_commands) { + int result; + KDB_STATE_SET(CMD); + result = (*tp->cmd_func)(argc-1, (const char **)argv); + if (result && ignore_errors && result > KDB_CMD_GO) + result = 0; + KDB_STATE_CLEAR(CMD); + switch (tp->cmd_repeat) { + case KDB_REPEAT_NONE: + argc = 0; + if (argv[0]) + *(argv[0]) = '\0'; + break; + case KDB_REPEAT_NO_ARGS: + argc = 1; + if (argv[1]) + *(argv[1]) = '\0'; + break; + case KDB_REPEAT_WITH_ARGS: + break; + } + return result; + } + + /* + * If the input with which we were presented does not + * map to an existing command, attempt to parse it as an + * address argument and display the result. 
Useful for + * obtaining the address of a variable, or the nearest symbol + * to an address contained in a register. + */ + { + unsigned long value; + char *name = NULL; + long offset; + int nextarg = 0; + + if (kdbgetaddrarg(0, (const char **)argv, &nextarg, + &value, &offset, &name)) { + return KDB_NOTFOUND; + } + + kdb_printf("%s = ", argv[0]); + kdb_symbol_print(value, NULL, KDB_SP_DEFAULT); + kdb_printf("\n"); + return 0; + } +} + + +static int handle_ctrl_cmd(char *cmd) +{ +#define CTRL_P 16 +#define CTRL_N 14 + + /* initial situation */ + if (cmd_head == cmd_tail) + return 0; + switch (*cmd) { + case CTRL_P: + if (cmdptr != cmd_tail) + cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + return 1; + case CTRL_N: + if (cmdptr != cmd_head) + cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; + strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + return 1; + } + return 0; +} + +/* + * kdb_reboot - This function implements the 'reboot' command. Reboot + * the system immediately, or loop for ever on failure. + */ +static int kdb_reboot(int argc, const char **argv) +{ + emergency_restart(); + kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n"); + while (1) + cpu_relax(); + /* NOTREACHED */ + return 0; +} + +static void kdb_dumpregs(struct pt_regs *regs) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + kdb_trap_printk++; + show_regs(regs); + kdb_trap_printk--; + kdb_printf("\n"); + console_loglevel = old_lvl; +} + +void kdb_set_current_task(struct task_struct *p) +{ + kdb_current_task = p; + + if (kdb_task_has_cpu(p)) { + kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p)); + return; + } + kdb_current_regs = NULL; +} + +/* + * kdb_local - The main code for kdb. This routine is invoked on a + * specific processor, it is not global. The main kdb() routine + * ensures that only one processor at a time is in this routine. + * This code is called with the real reason code on the first + * entry to a kdb session, thereafter it is called with reason + * SWITCH, even if the user goes back to the original cpu. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * regs The exception frame at time of fault/breakpoint. + * db_result Result code from the break or debug point. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * KDB_CMD_GO User typed 'go'. + * KDB_CMD_CPU User switched to another cpu. + * KDB_CMD_SS Single step. + * KDB_CMD_SSB Single step until branch. + */ +static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, + kdb_dbtrap_t db_result) +{ + char *cmdbuf; + int diag; + struct task_struct *kdb_current = + kdb_curr_task(raw_smp_processor_id()); + + KDB_DEBUG_STATE("kdb_local 1", reason); + kdb_go_count = 0; + if (reason == KDB_REASON_DEBUG) { + /* special case below */ + } else { + kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + } + + switch (reason) { + case KDB_REASON_DEBUG: + { + /* + * If re-entering kdb after a single step + * command, don't print the message. 
+ */ + switch (db_result) { + case KDB_DB_BPT: + kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + case KDB_DB_SSB: + /* + * In the midst of ssb command. Just return. + */ + KDB_DEBUG_STATE("kdb_local 3", reason); + return KDB_CMD_SSB; /* Continue with SSB command */ + + break; + case KDB_DB_SS: + break; + case KDB_DB_SSBPT: + KDB_DEBUG_STATE("kdb_local 4", reason); + return 1; /* kdba_db_trap did the work */ + default: + kdb_printf("kdb: Bad result from kdba_db_trap: %d\n", + db_result); + break; + } + + } + break; + case KDB_REASON_ENTER: + if (KDB_STATE(KEYBOARD)) + kdb_printf("due to Keyboard Entry\n"); + else + kdb_printf("due to KDB_ENTER()\n"); + break; + case KDB_REASON_KEYBOARD: + KDB_STATE_SET(KEYBOARD); + kdb_printf("due to Keyboard Entry\n"); + break; + case KDB_REASON_ENTER_SLAVE: + /* drop through, slaves only get released via cpu switch */ + case KDB_REASON_SWITCH: + kdb_printf("due to cpu switch\n"); + break; + case KDB_REASON_OOPS: + kdb_printf("Oops: %s\n", kdb_diemsg); + kdb_printf("due to oops @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_NMI: + kdb_printf("due to NonMaskable Interrupt @ " + kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_SSTEP: + case KDB_REASON_BREAK: + kdb_printf("due to %s @ " kdb_machreg_fmt "\n", + reason == KDB_REASON_BREAK ? + "Breakpoint" : "SS trap", instruction_pointer(regs)); + /* + * Determine if this breakpoint is one that we + * are interested in. + */ + if (db_result != KDB_DB_BPT) { + kdb_printf("kdb: error return from kdba_bp_trap: %d\n", + db_result); + KDB_DEBUG_STATE("kdb_local 6", reason); + return 0; /* Not for us, dismiss it */ + } + break; + case KDB_REASON_RECURSE: + kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + default: + kdb_printf("kdb: unexpected reason code: %d\n", reason); + KDB_DEBUG_STATE("kdb_local 8", reason); + return 0; /* Not for us, dismiss it */ + } + + while (1) { + /* + * Initialize pager context. + */ + kdb_nextline = 1; + KDB_STATE_CLEAR(SUPPRESS); + + cmdbuf = cmd_cur; + *cmdbuf = '\0'; + *(cmd_hist[cmd_head]) = '\0'; + + if (KDB_FLAG(ONLY_DO_DUMP)) { + /* kdb is off but a catastrophic error requires a dump. + * Take the dump and reboot. + * Turn on logging so the kdb output appears in the log + * buffer in the dump. 
+ */ + const char *setargs[] = { "set", "LOGGING", "1" }; + kdb_set(2, setargs); + kdb_reboot(0, NULL); + /*NOTREACHED*/ + } + +do_full_getstr: +#if defined(CONFIG_SMP) + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), + raw_smp_processor_id()); +#else + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); +#endif + if (defcmd_in_progress) + strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); + + /* + * Fetch command from keyboard + */ + cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str); + if (*cmdbuf != '\n') { + if (*cmdbuf < 32) { + if (cmdptr == cmd_head) { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + *(cmd_hist[cmd_head] + + strlen(cmd_hist[cmd_head])-1) = '\0'; + } + if (!handle_ctrl_cmd(cmdbuf)) + *(cmd_cur+strlen(cmd_cur)-1) = '\0'; + cmdbuf = cmd_cur; + goto do_full_getstr; + } else { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + } + + cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT; + if (cmd_head == cmd_tail) + cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT; + } + + cmdptr = cmd_head; + diag = kdb_parse(cmdbuf); + if (diag == KDB_NOTFOUND) { + kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); + diag = 0; + } + if (diag == KDB_CMD_GO + || diag == KDB_CMD_CPU + || diag == KDB_CMD_SS + || diag == KDB_CMD_SSB + || diag == KDB_CMD_KGDB) + break; + + if (diag) + kdb_cmderror(diag); + } + KDB_DEBUG_STATE("kdb_local 9", diag); + return diag; +} + + +/* + * kdb_print_state - Print the state data for the current processor + * for debugging. + * Inputs: + * text Identifies the debug point + * value Any integer value to be printed, e.g. reason code. + */ +void kdb_print_state(const char *text, int value) +{ + kdb_printf("state: %s cpu %d value %d initial %d state %x\n", + text, raw_smp_processor_id(), value, kdb_initial_cpu, + kdb_state); +} + +/* + * kdb_main_loop - After initial setup and assignment of the + * controlling cpu, all cpus are in this loop. One cpu is in + * control and will issue the kdb prompt, the others will spin + * until 'go' or cpu switch. + * + * To get a consistent view of the kernel stacks for all + * processes, this routine is invoked from the main kdb code via + * an architecture specific routine. kdba_main_loop is + * responsible for making the kernel stacks consistent for all + * processes, there should be no difference between a blocked + * process and a running process as far as kdb is concerned. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * reason2 kdb's current reason code. + * Initially error but can change + * acording to kdb state. + * db_result Result code from break or debug point. + * regs The exception frame at time of fault/breakpoint. + * should always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + */ +int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, + kdb_dbtrap_t db_result, struct pt_regs *regs) +{ + int result = 1; + /* Stay in kdb() until 'go', 'ss[b]' or an error */ + while (1) { + /* + * All processors except the one that is in control + * will spin here. + */ + KDB_DEBUG_STATE("kdb_main_loop 1", reason); + while (KDB_STATE(HOLD_CPU)) { + /* state KDB is turned off by kdb_cpu to see if the + * other cpus are still live, each cpu in this loop + * turns it back on. 
+ */ + if (!KDB_STATE(KDB)) + KDB_STATE_SET(KDB); + } + + KDB_STATE_CLEAR(SUPPRESS); + KDB_DEBUG_STATE("kdb_main_loop 2", reason); + if (KDB_STATE(LEAVING)) + break; /* Another cpu said 'go' */ + /* Still using kdb, this processor is in control */ + result = kdb_local(reason2, error, regs, db_result); + KDB_DEBUG_STATE("kdb_main_loop 3", result); + + if (result == KDB_CMD_CPU) + break; + + if (result == KDB_CMD_SS) { + KDB_STATE_SET(DOING_SS); + break; + } + + if (result == KDB_CMD_SSB) { + KDB_STATE_SET(DOING_SS); + KDB_STATE_SET(DOING_SSB); + break; + } + + if (result == KDB_CMD_KGDB) { + if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + kdb_printf("Entering please attach debugger " + "or use $D#44+ or $3#33\n"); + break; + } + if (result && result != 1 && result != KDB_CMD_GO) + kdb_printf("\nUnexpected kdb_local return code %d\n", + result); + KDB_DEBUG_STATE("kdb_main_loop 4", reason); + break; + } + if (KDB_STATE(DOING_SS)) + KDB_STATE_CLEAR(SSBPT); + + return result; +} + +/* + * kdb_mdr - This function implements the guts of the 'mdr', memory + * read command. + * mdr <addr arg>,<byte count> + * Inputs: + * addr Start address + * count Number of bytes + * Returns: + * Always 0. Any errors are detected and printed by kdb_getarea. + */ +static int kdb_mdr(unsigned long addr, unsigned int count) +{ + unsigned char c; + while (count--) { + if (kdb_getarea(c, addr)) + return 0; + kdb_printf("%02x", c); + addr++; + } + kdb_printf("\n"); + return 0; +} + +/* + * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4', + * 'md8' 'mdr' and 'mds' commands. + * + * md|mds [<addr arg> [<line count> [<radix>]]] + * mdWcN [<addr arg> [<line count> [<radix>]]] + * where W = is the width (1, 2, 4 or 8) and N is the count. + * for eg., md1c20 reads 20 bytes, 1 at a time. + * mdr <addr arg>,<byte count> + */ +static void kdb_md_line(const char *fmtstr, unsigned long addr, + int symbolic, int nosect, int bytesperword, + int num, int repeat, int phys) +{ + /* print just one line of data */ + kdb_symtab_t symtab; + char cbuf[32]; + char *c = cbuf; + int i; + unsigned long word; + + memset(cbuf, '\0', sizeof(cbuf)); + if (phys) + kdb_printf("phys " kdb_machreg_fmt0 " ", addr); + else + kdb_printf(kdb_machreg_fmt0 " ", addr); + + for (i = 0; i < num && repeat--; i++) { + if (phys) { + if (kdb_getphysword(&word, addr, bytesperword)) + break; + } else if (kdb_getword(&word, addr, bytesperword)) + break; + kdb_printf(fmtstr, word); + if (symbolic) + kdbnearsym(word, &symtab); + else + memset(&symtab, 0, sizeof(symtab)); + if (symtab.sym_name) { + kdb_symbol_print(word, &symtab, 0); + if (!nosect) { + kdb_printf("\n"); + kdb_printf(" %s %s " + kdb_machreg_fmt " " + kdb_machreg_fmt " " + kdb_machreg_fmt, symtab.mod_name, + symtab.sec_name, symtab.sec_start, + symtab.sym_start, symtab.sym_end); + } + addr += bytesperword; + } else { + union { + u64 word; + unsigned char c[8]; + } wc; + unsigned char *cp; +#ifdef __BIG_ENDIAN + cp = wc.c + 8 - bytesperword; +#else + cp = wc.c; +#endif + wc.word = word; +#define printable_char(c) \ + ({unsigned char __c = c; isascii(__c) && isprint(__c) ? 
__c : '.'; }) + switch (bytesperword) { + case 8: + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + addr += 4; + case 4: + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + addr += 2; + case 2: + *c++ = printable_char(*cp++); + addr++; + case 1: + *c++ = printable_char(*cp++); + addr++; + break; + } +#undef printable_char + } + } + kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1), + " ", cbuf); +} + +static int kdb_md(int argc, const char **argv) +{ + static unsigned long last_addr; + static int last_radix, last_bytesperword, last_repeat; + int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat; + int nosect = 0; + char fmtchar, fmtstr[64]; + unsigned long addr; + unsigned long word; + long offset = 0; + int symbolic = 0; + int valid = 0; + int phys = 0; + + kdbgetintenv("MDCOUNT", &mdcount); + kdbgetintenv("RADIX", &radix); + kdbgetintenv("BYTESPERWORD", &bytesperword); + + /* Assume 'md <addr>' and start with environment values */ + repeat = mdcount * 16 / bytesperword; + + if (strcmp(argv[0], "mdr") == 0) { + if (argc != 2) + return KDB_ARGCOUNT; + valid = 1; + } else if (isdigit(argv[0][2])) { + bytesperword = (int)(argv[0][2] - '0'); + if (bytesperword == 0) { + bytesperword = last_bytesperword; + if (bytesperword == 0) + bytesperword = 4; + } + last_bytesperword = bytesperword; + repeat = mdcount * 16 / bytesperword; + if (!argv[0][3]) + valid = 1; + else if (argv[0][3] == 'c' && argv[0][4]) { + char *p; + repeat = simple_strtoul(argv[0] + 4, &p, 10); + mdcount = ((repeat * bytesperword) + 15) / 16; + valid = !*p; + } + last_repeat = repeat; + } else if (strcmp(argv[0], "md") == 0) + valid = 1; + else if (strcmp(argv[0], "mds") == 0) + valid = 1; + else if (strcmp(argv[0], "mdp") == 0) { + phys = valid = 1; + } + if (!valid) + return KDB_NOTFOUND; + + if (argc == 0) { + if (last_addr == 0) + return KDB_ARGCOUNT; + addr = last_addr; + radix = last_radix; + bytesperword = last_bytesperword; + repeat = last_repeat; + mdcount = ((repeat * bytesperword) + 15) / 16; + } + + if (argc) { + unsigned long val; + int diag, nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if (diag) + return diag; + if (argc > nextarg+2) + return KDB_ARGCOUNT; + + if (argc >= nextarg) { + diag = kdbgetularg(argv[nextarg], &val); + if (!diag) { + mdcount = (int) val; + repeat = mdcount * 16 / bytesperword; + } + } + if (argc >= nextarg+1) { + diag = kdbgetularg(argv[nextarg+1], &val); + if (!diag) + radix = (int) val; + } + } + + if (strcmp(argv[0], "mdr") == 0) + return kdb_mdr(addr, mdcount); + + switch (radix) { + case 10: + fmtchar = 'd'; + break; + case 16: + fmtchar = 'x'; + break; + case 8: + fmtchar = 'o'; + break; + default: + return KDB_BADRADIX; + } + + last_radix = radix; + + if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + + switch (bytesperword) { + case 8: + sprintf(fmtstr, "%%16.16l%c ", fmtchar); + break; + case 4: + sprintf(fmtstr, "%%8.8l%c ", fmtchar); + break; + case 2: + sprintf(fmtstr, "%%4.4l%c ", fmtchar); + break; + case 1: + sprintf(fmtstr, "%%2.2l%c ", fmtchar); + break; + default: + return KDB_BADWIDTH; + } + + last_repeat = repeat; + last_bytesperword = bytesperword; + + if (strcmp(argv[0], "mds") == 0) { + symbolic = 1; + /* Do not save these changes as last_*, they are temporary mds + * overrides. 
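+	 * e.g. "mds" displays one machine word per line and attempts to
+	 * resolve each word to a kernel symbol.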
+ */ + bytesperword = KDB_WORD_SIZE; + repeat = mdcount; + kdbgetintenv("NOSECT", &nosect); + } + + /* Round address down modulo BYTESPERWORD */ + + addr &= ~(bytesperword-1); + + while (repeat > 0) { + unsigned long a; + int n, z, num = (symbolic ? 1 : (16 / bytesperword)); + + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) { + if (phys) { + if (kdb_getphysword(&word, a, bytesperword) + || word) + break; + } else if (kdb_getword(&word, a, bytesperword) || word) + break; + } + n = min(num, repeat); + kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword, + num, repeat, phys); + addr += bytesperword * n; + repeat -= n; + z = (z + num - 1) / num; + if (z > 2) { + int s = num * (z-2); + kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0 + " zero suppressed\n", + addr, addr + bytesperword * s - 1); + addr += bytesperword * s; + repeat -= s; + } + } + last_addr = addr; + + return 0; +} + +/* + * kdb_mm - This function implements the 'mm' command. + * mm address-expression new-value + * Remarks: + * mm works on machine words, mmW works on bytes. + */ +static int kdb_mm(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset = 0; + unsigned long contents; + int nextarg; + int width; + + if (argv[0][2] && !isdigit(argv[0][2])) + return KDB_NOTFOUND; + + if (argc < 2) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + + if (nextarg > argc) + return KDB_ARGCOUNT; + diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL); + if (diag) + return diag; + + if (nextarg != argc + 1) + return KDB_ARGCOUNT; + + width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE); + diag = kdb_putword(addr, contents, width); + if (diag) + return diag; + + kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents); + + return 0; +} + +/* + * kdb_go - This function implements the 'go' command. + * go [address-expression] + */ +static int kdb_go(int argc, const char **argv) +{ + unsigned long addr; + int diag; + int nextarg; + long offset; + + if (argc == 1) { + if (raw_smp_processor_id() != kdb_initial_cpu) { + kdb_printf("go <address> must be issued from the " + "initial cpu, do cpu %d first\n", + kdb_initial_cpu); + return KDB_ARGCOUNT; + } + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, + &addr, &offset, NULL); + if (diag) + return diag; + } else if (argc) { + return KDB_ARGCOUNT; + } + + diag = KDB_CMD_GO; + if (KDB_FLAG(CATASTROPHIC)) { + kdb_printf("Catastrophic error detected\n"); + kdb_printf("kdb_continue_catastrophic=%d, ", + kdb_continue_catastrophic); + if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) { + kdb_printf("type go a second time if you really want " + "to continue\n"); + return 0; + } + if (kdb_continue_catastrophic == 2) { + kdb_printf("forcing reboot\n"); + kdb_reboot(0, NULL); + } + kdb_printf("attempting to continue\n"); + } + return diag; +} + +/* + * kdb_rd - This function implements the 'rd' command. 
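+ *	rd
+ *		Displays all registers known to dbg_reg_def[] for the
+ *		current cpu/task, or falls back to kdb_dumpregs() when
+ *		DBG_MAX_REG_NUM is 0.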
+ */ +static int kdb_rd(int argc, const char **argv) +{ + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; + + kdb_dumpregs(kdb_current_regs); +#endif + return 0; +} + +/* + * kdb_rm - This function implements the 'rm' (register modify) command. + * rm register-name new-contents + * Remarks: + * Allows register modification with the same restrictions as gdb + */ +static int kdb_rm(int argc, const char **argv) +{ +#if DBG_MAX_REG_NUM > 0 + int diag; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (argc != 2) + return KDB_ARGCOUNT; + /* + * Allow presence or absence of leading '%' symbol. + */ + rname = argv[1]; + if (*rname == '%') + rname++; + + diag = kdbgetu64arg(argv[2], ®64); + if (diag) + return diag; + + diag = kdb_check_regs(); + if (diag) + return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else + kdb_printf("ERROR: Register set currently not implemented\n"); + return 0; +#endif +} + +#if defined(CONFIG_MAGIC_SYSRQ) +/* + * kdb_sr - This function implements the 'sr' (SYSRQ key) command + * which interfaces to the soi-disant MAGIC SYSRQ functionality. + * sr <magic-sysrq-code> + */ +static int kdb_sr(int argc, const char **argv) +{ + if (argc != 1) + return KDB_ARGCOUNT; + kdb_trap_printk++; + __handle_sysrq(*argv[1], false); + kdb_trap_printk--; + + return 0; +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +/* + * kdb_ef - This function implements the 'regs' (display exception + * frame) command. This command takes an address and expects to + * find an exception frame at that address, formats and prints + * it. + * regs address-expression + * Remarks: + * Not done yet. 
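+ *	(For now the supplied address is simply handed to show_regs().)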
+ */ +static int kdb_ef(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset; + int nextarg; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + show_regs((struct pt_regs *)addr); + return 0; +} + +#if defined(CONFIG_MODULES) +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. + */ +static int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, kdb_modules, list) { + + kdb_printf("%-20s%8u 0x%p ", mod->name, + mod->core_size, (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + kdb_printf(" [ "); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} + +#endif /* CONFIG_MODULES */ + +/* + * kdb_env - This function implements the 'env' command. Display the + * current environment variables. + */ + +static int kdb_env(int argc, const char **argv) +{ + int i; + + for (i = 0; i < __nenv; i++) { + if (__env[i]) + kdb_printf("%s\n", __env[i]); + } + + if (KDB_DEBUG(MASK)) + kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + + return 0; +} + +#ifdef CONFIG_PRINTK +/* + * kdb_dmesg - This function implements the 'dmesg' command to display + * the contents of the syslog buffer. + * dmesg [lines] [adjust] + */ +static int kdb_dmesg(int argc, const char **argv) +{ + char *syslog_data[4], *start, *end, c = '\0', *p; + int diag, logging, logsize, lines = 0, adjust = 0, n; + + if (argc > 2) + return KDB_ARGCOUNT; + if (argc) { + char *cp; + lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + lines = 0; + if (argc > 1) { + adjust = simple_strtoul(argv[2], &cp, 0); + if (*cp || adjust < 0) + adjust = 0; + } + } + + /* disable LOGGING if set */ + diag = kdbgetintenv("LOGGING", &logging); + if (!diag && logging) { + const char *setargs[] = { "set", "LOGGING", "0" }; + kdb_set(2, setargs); + } + + /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] + * logical start, end+1. 
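+	 * KDB_WRAP() below maps a logical offset back into the physical
+	 * buffer, since the logical range can run past the end of the
+	 * circular log buffer.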
*/ + kdb_syslog_data(syslog_data); + if (syslog_data[2] == syslog_data[3]) + return 0; + logsize = syslog_data[1] - syslog_data[0]; + start = syslog_data[2]; + end = syslog_data[3]; +#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) + for (n = 0, p = start; p < end; ++p) { + c = *KDB_WRAP(p); + if (c == '\n') + ++n; + } + if (c != '\n') + ++n; + if (lines < 0) { + if (adjust >= n) + kdb_printf("buffer only contains %d lines, nothing " + "printed\n", n); + else if (adjust - lines >= n) + kdb_printf("buffer only contains %d lines, last %d " + "lines printed\n", n, n - adjust); + if (adjust) { + for (; start < end && adjust; ++start) { + if (*KDB_WRAP(start) == '\n') + --adjust; + } + if (start < end) + ++start; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + ++lines; + } + end = p; + } else if (lines > 0) { + int skip = n - (adjust + lines); + if (adjust >= n) { + kdb_printf("buffer only contains %d lines, " + "nothing printed\n", n); + skip = n; + } else if (skip < 0) { + lines += skip; + skip = 0; + kdb_printf("buffer only contains %d lines, first " + "%d lines printed\n", n, lines); + } + for (; start < end && skip; ++start) { + if (*KDB_WRAP(start) == '\n') + --skip; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + --lines; + } + end = p; + } + /* Do a line at a time (max 200 chars) to reduce protocol overhead */ + c = '\n'; + while (start != end) { + char buf[201]; + p = buf; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + while (start < end && (c = *KDB_WRAP(start)) && + (p - buf) < sizeof(buf)-1) { + ++start; + *p++ = c; + if (c == '\n') + break; + } + *p = '\0'; + kdb_printf("%s", buf); + } + if (c != '\n') + kdb_printf("\n"); + + return 0; +} +#endif /* CONFIG_PRINTK */ +/* + * kdb_cpu - This function implements the 'cpu' command. 
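+ *	Display the status of all cpus, or switch control of kdb to
+ *	another cpu: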
+ * cpu [<cpunum>] + * Returns: + * KDB_CMD_CPU for success, a kdb diagnostic if error + */ +static void kdb_cpu_status(void) +{ + int i, start_cpu, first_print = 1; + char state, prev_state = '?'; + + kdb_printf("Currently on cpu %d\n", raw_smp_processor_id()); + kdb_printf("Available cpus: "); + for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) { + state = 'F'; /* cpu is offline */ + } else { + state = ' '; /* cpu is responding to kdb */ + if (kdb_task_state_char(KDB_TSK(i)) == 'I') + state = 'I'; /* idle task */ + } + if (state != prev_state) { + if (prev_state != '?') { + if (!first_print) + kdb_printf(", "); + first_print = 0; + kdb_printf("%d", start_cpu); + if (start_cpu < i-1) + kdb_printf("-%d", i-1); + if (prev_state != ' ') + kdb_printf("(%c)", prev_state); + } + prev_state = state; + start_cpu = i; + } + } + /* print the trailing cpus, ignoring them if they are all offline */ + if (prev_state != 'F') { + if (!first_print) + kdb_printf(", "); + kdb_printf("%d", start_cpu); + if (start_cpu < i-1) + kdb_printf("-%d", i-1); + if (prev_state != ' ') + kdb_printf("(%c)", prev_state); + } + kdb_printf("\n"); +} + +static int kdb_cpu(int argc, const char **argv) +{ + unsigned long cpunum; + int diag; + + if (argc == 0) { + kdb_cpu_status(); + return 0; + } + + if (argc != 1) + return KDB_ARGCOUNT; + + diag = kdbgetularg(argv[1], &cpunum); + if (diag) + return diag; + + /* + * Validate cpunum + */ + if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) + return KDB_BADCPUNUM; + + dbg_switch_cpu = cpunum; + + /* + * Switch to other cpu + */ + return KDB_CMD_CPU; +} + +/* The user may not realize that ps/bta with no parameters does not print idle + * or sleeping system daemon processes, so tell them how many were suppressed. + */ +void kdb_ps_suppressed(void) +{ + int idle = 0, daemon = 0; + unsigned long mask_I = kdb_task_state_string("I"), + mask_M = kdb_task_state_string("M"); + unsigned long cpu; + const struct task_struct *p, *g; + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_task_state(p, mask_I)) + ++idle; + } + kdb_do_each_thread(g, p) { + if (kdb_task_state(p, mask_M)) + ++daemon; + } kdb_while_each_thread(g, p); + if (idle || daemon) { + if (idle) + kdb_printf("%d idle process%s (state I)%s\n", + idle, idle == 1 ? "" : "es", + daemon ? " and " : ""); + if (daemon) + kdb_printf("%d sleeping system daemon (state M) " + "process%s", daemon, + daemon == 1 ? "" : "es"); + kdb_printf(" suppressed,\nuse 'ps A' to see all.\n"); + } +} + +/* + * kdb_ps - This function implements the 'ps' command which shows a + * list of the active processes. + * ps [DRSTCZEUIMA] All processes, optionally filtered by state + */ +void kdb_ps1(const struct task_struct *p) +{ + int cpu; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return; + + cpu = kdb_process_cpu(p); + kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + (void *)p, p->pid, p->parent->pid, + kdb_task_has_cpu(p), kdb_process_cpu(p), + kdb_task_state_char(p), + (void *)(&p->thread), + p == kdb_curr_task(raw_smp_processor_id()) ? 
'*' : ' ', + p->comm); + if (kdb_task_has_cpu(p)) { + if (!KDB_TSK(cpu)) { + kdb_printf(" Error: no saved data for this cpu\n"); + } else { + if (KDB_TSK(cpu) != p) + kdb_printf(" Error: does not match running " + "process table (0x%p)\n", KDB_TSK(cpu)); + } + } +} + +static int kdb_ps(int argc, const char **argv) +{ + struct task_struct *g, *p; + unsigned long mask, cpu; + + if (argc == 0) + kdb_ps_suppressed(); + kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n", + (int)(2*sizeof(void *))+2, "Task Addr", + (int)(2*sizeof(void *))+2, "Thread"); + mask = kdb_task_state_string(argc ? argv[1] : NULL); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + p = kdb_curr_task(cpu); + if (kdb_task_state(p, mask)) + kdb_ps1(p); + } + kdb_printf("\n"); + /* Now the real tasks */ + kdb_do_each_thread(g, p) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (kdb_task_state(p, mask)) + kdb_ps1(p); + } kdb_while_each_thread(g, p); + + return 0; +} + +/* + * kdb_pid - This function implements the 'pid' command which switches + * the currently active process. + * pid [<pid> | R] + */ +static int kdb_pid(int argc, const char **argv) +{ + struct task_struct *p; + unsigned long val; + int diag; + + if (argc > 1) + return KDB_ARGCOUNT; + + if (argc) { + if (strcmp(argv[1], "R") == 0) { + p = KDB_TSK(kdb_initial_cpu); + } else { + diag = kdbgetularg(argv[1], &val); + if (diag) + return KDB_BADINT; + + p = find_task_by_pid_ns((pid_t)val, &init_pid_ns); + if (!p) { + kdb_printf("No task with pid=%d\n", (pid_t)val); + return 0; + } + } + kdb_set_current_task(p); + } + kdb_printf("KDB current process is %s(pid=%d)\n", + kdb_current_task->comm, + kdb_current_task->pid); + + return 0; +} + +/* + * kdb_ll - This function implements the 'll' command which follows a + * linked list and executes an arbitrary command for each + * element. + */ +static int kdb_ll(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset = 0; + unsigned long va; + unsigned long linkoffset; + int nextarg; + const char *command; + + if (argc != 3) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + + diag = kdbgetularg(argv[2], &linkoffset); + if (diag) + return diag; + + /* + * Using the starting address as + * the first element in the list, and assuming that + * the list ends with a null pointer. + */ + + va = addr; + command = kdb_strdup(argv[3], GFP_KDB); + if (!command) { + kdb_printf("%s: cannot duplicate command\n", __func__); + return 0; + } + /* Recursive use of kdb_parse, do not use argv after this point */ + argv = NULL; + + while (va) { + char buf[80]; + + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); + diag = kdb_parse(buf); + if (diag) + return diag; + + addr = va + linkoffset; + if (kdb_getword(&va, addr, sizeof(va))) + return 0; + } + kfree(command); + + return 0; +} + +static int kdb_kgdb(int argc, const char **argv) +{ + return KDB_CMD_KGDB; +} + +/* + * kdb_help - This function implements the 'help' and '?' commands. 
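+ *	Prints one line per registered command: name, usage and help text.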
+ */ +static int kdb_help(int argc, const char **argv) +{ + kdbtab_t *kt; + int i; + + kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description"); + kdb_printf("-----------------------------" + "-----------------------------\n"); + for_each_kdbcmd(kt, i) { + if (kt->cmd_name) + kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, + kt->cmd_usage, kt->cmd_help); + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + } + return 0; +} + +/* + * kdb_kill - This function implements the 'kill' commands. + */ +static int kdb_kill(int argc, const char **argv) +{ + long sig, pid; + char *endp; + struct task_struct *p; + struct siginfo info; + + if (argc != 2) + return KDB_ARGCOUNT; + + sig = simple_strtol(argv[1], &endp, 0); + if (*endp) + return KDB_BADINT; + if (sig >= 0) { + kdb_printf("Invalid signal parameter.<-signal>\n"); + return 0; + } + sig = -sig; + + pid = simple_strtol(argv[2], &endp, 0); + if (*endp) + return KDB_BADINT; + if (pid <= 0) { + kdb_printf("Process ID must be large than 0.\n"); + return 0; + } + + /* Find the process. */ + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (!p) { + kdb_printf("The specified process isn't found.\n"); + return 0; + } + p = p->group_leader; + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; + info.si_pid = pid; /* same capabilities as process being signalled */ + info.si_uid = 0; /* kdb has root authority */ + kdb_send_sig_info(p, &info); + return 0; +} + +struct kdb_tm { + int tm_sec; /* seconds */ + int tm_min; /* minutes */ + int tm_hour; /* hours */ + int tm_mday; /* day of the month */ + int tm_mon; /* month */ + int tm_year; /* year */ +}; + +static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) +{ + /* This will work from 1970-2099, 2100 is not a leap year */ + static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, + 31, 30, 31, 30, 31 }; + memset(tm, 0, sizeof(*tm)); + tm->tm_sec = tv->tv_sec % (24 * 60 * 60); + tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + + (2 * 365 + 1); /* shift base from 1970 to 1968 */ + tm->tm_min = tm->tm_sec / 60 % 60; + tm->tm_hour = tm->tm_sec / 60 / 60; + tm->tm_sec = tm->tm_sec % 60; + tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); + tm->tm_mday %= (4*365+1); + mon_day[1] = 29; + while (tm->tm_mday >= mon_day[tm->tm_mon]) { + tm->tm_mday -= mon_day[tm->tm_mon]; + if (++tm->tm_mon == 12) { + tm->tm_mon = 0; + ++tm->tm_year; + mon_day[1] = 28; + } + } + ++tm->tm_mday; +} + +/* + * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). + * I cannot call that code directly from kdb, it has an unconditional + * cli()/sti() and calls routines that take locks which can stop the debugger. + */ +static void kdb_sysinfo(struct sysinfo *val) +{ + struct timespec uptime; + do_posix_clock_monotonic_gettime(&uptime); + memset(val, 0, sizeof(*val)); + val->uptime = uptime.tv_sec; + val->loads[0] = avenrun[0]; + val->loads[1] = avenrun[1]; + val->loads[2] = avenrun[2]; + val->procs = nr_threads-1; + si_meminfo(val); + + return; +} + +/* + * kdb_summary - This function implements the 'summary' command. 
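+ *	Prints utsname information, the current date, uptime, load
+ *	averages and a brief memory summary.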
+ */ +static int kdb_summary(int argc, const char **argv) +{ + struct timespec now; + struct kdb_tm tm; + struct sysinfo val; + + if (argc) + return KDB_ARGCOUNT; + + kdb_printf("sysname %s\n", init_uts_ns.name.sysname); + kdb_printf("release %s\n", init_uts_ns.name.release); + kdb_printf("version %s\n", init_uts_ns.name.version); + kdb_printf("machine %s\n", init_uts_ns.name.machine); + kdb_printf("nodename %s\n", init_uts_ns.name.nodename); + kdb_printf("domainname %s\n", init_uts_ns.name.domainname); + kdb_printf("ccversion %s\n", __stringify(CCVERSION)); + + now = __current_kernel_time(); + kdb_gmtime(&now, &tm); + kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + "tz_minuteswest %d\n", + 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, + sys_tz.tz_minuteswest); + + kdb_sysinfo(&val); + kdb_printf("uptime "); + if (val.uptime > (24*60*60)) { + int days = val.uptime / (24*60*60); + val.uptime %= (24*60*60); + kdb_printf("%d day%s ", days, days == 1 ? "" : "s"); + } + kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); + + /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", + LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), + LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), + LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); +#undef LOAD_INT +#undef LOAD_FRAC + /* Display in kilobytes */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" + "Buffers: %8lu kB\n", + val.totalram, val.freeram, val.bufferram); + return 0; +} + +/* + * kdb_per_cpu - This function implements the 'per_cpu' command. + */ +static int kdb_per_cpu(int argc, const char **argv) +{ + char buf[256], fmtstr[64]; + kdb_symtab_t symtab; + cpumask_t suppress = CPU_MASK_NONE; + int cpu, diag; + unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + + if (argc < 1 || argc > 3) + return KDB_ARGCOUNT; + + snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); + if (!kdbgetsymval(buf, &symtab)) { + kdb_printf("%s is not a per_cpu variable\n", argv[1]); + return KDB_BADADDR; + } + if (argc >= 2) { + diag = kdbgetularg(argv[2], &bytesperword); + if (diag) + return diag; + } + if (!bytesperword) + bytesperword = KDB_WORD_SIZE; + else if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword)); + if (argc >= 3) { + diag = kdbgetularg(argv[3], &whichcpu); + if (diag) + return diag; + if (!cpu_online(whichcpu)) { + kdb_printf("cpu %ld is not online\n", whichcpu); + return KDB_BADCPUNUM; + } + } + + /* Most architectures use __per_cpu_offset[cpu], some use + * __per_cpu_offset(cpu), smp has no __per_cpu_offset. 
+ */ +#ifdef __per_cpu_offset +#define KDB_PCU(cpu) __per_cpu_offset(cpu) +#else +#ifdef CONFIG_SMP +#define KDB_PCU(cpu) __per_cpu_offset[cpu] +#else +#define KDB_PCU(cpu) 0 +#endif +#endif + + for_each_online_cpu(cpu) { + if (whichcpu != ~0UL && whichcpu != cpu) + continue; + addr = symtab.sym_start + KDB_PCU(cpu); + diag = kdb_getword(&val, addr, bytesperword); + if (diag) { + kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " + "read, diag=%d\n", cpu, addr, diag); + continue; + } +#ifdef CONFIG_SMP + if (!val) { + cpu_set(cpu, suppress); + continue; + } +#endif /* CONFIG_SMP */ + kdb_printf("%5d ", cpu); + kdb_md_line(fmtstr, addr, + bytesperword == KDB_WORD_SIZE, + 1, bytesperword, 1, 1, 0); + } + if (cpus_weight(suppress) == 0) + return 0; + kdb_printf("Zero suppressed cpu(s):"); + for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); + cpu = next_cpu(cpu, suppress)) { + kdb_printf(" %d", cpu); + if (cpu == num_possible_cpus() - 1 || + next_cpu(cpu, suppress) != cpu + 1) + continue; + while (cpu < num_possible_cpus() && + next_cpu(cpu, suppress) == cpu + 1) + ++cpu; + kdb_printf("-%d", cpu); + } + kdb_printf("\n"); + +#undef KDB_PCU + + return 0; +} + +/* + * display help for the use of cmd | grep pattern + */ +static int kdb_grep_help(int argc, const char **argv) +{ + kdb_printf("Usage of cmd args | grep pattern:\n"); + kdb_printf(" Any command's output may be filtered through an "); + kdb_printf("emulated 'pipe'.\n"); + kdb_printf(" 'grep' is just a key word.\n"); + kdb_printf(" The pattern may include a very limited set of " + "metacharacters:\n"); + kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n"); + kdb_printf(" And if there are spaces in the pattern, you may " + "quote it:\n"); + kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\"" + " or \"^pat tern$\"\n"); + return 0; +} + +/* + * kdb_register_repeat - This function is used to register a kernel + * debugger command. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * repeat Does the command auto repeat on enter? + * Returns: + * zero for success, one if a duplicate command. 
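As a usage sketch of the registration interface documented here: a loadable module would normally pair kdb_register_repeat() with kdb_unregister() in its exit path. The command name, handler and module below are hypothetical, and the prototypes are assumed to be in scope (in this patch they live in kdb_private.h).

#include <linux/module.h>
#include <linux/kdb.h>          /* kdb_printf(); assumed header location */

static int kdb_hello(int argc, const char **argv)
{
        kdb_printf("hello from kdb, argc=%d\n", argc);
        return 0;
}

static int __init hello_kdb_init(void)
{
        /* cmd, handler, usage, help, minlen, repeat behaviour */
        if (kdb_register_repeat("hello", kdb_hello, "[arg]",
                                "Print a test greeting", 0,
                                KDB_REPEAT_NONE))
                return -EBUSY;  /* duplicate command name */
        return 0;
}

static void __exit hello_kdb_exit(void)
{
        kdb_unregister("hello");
}

module_init(hello_kdb_init);
module_exit(hello_kdb_exit);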
+ */ +#define kdb_command_extend 50 /* arbitrary */ +int kdb_register_repeat(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_repeat_t repeat) +{ + int i; + kdbtab_t *kp; + + /* + * Brute force method to determine duplicates + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kdb_printf("Duplicate kdb command registered: " + "%s, func %p help %s\n", cmd, func, help); + return 1; + } + } + + /* + * Insert command into first available location in table + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name == NULL) + break; + } + + if (i >= kdb_max_commands) { + kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + + kdb_command_extend) * sizeof(*new), GFP_KDB); + if (!new) { + kdb_printf("Could not allocate new kdb_command " + "table\n"); + return 1; + } + if (kdb_commands) { + memcpy(new, kdb_commands, + kdb_max_commands * sizeof(*new)); + kfree(kdb_commands); + } + memset(new + kdb_max_commands, 0, + kdb_command_extend * sizeof(*new)); + kdb_commands = new; + kp = kdb_commands + kdb_max_commands; + kdb_max_commands += kdb_command_extend; + } + + kp->cmd_name = cmd; + kp->cmd_func = func; + kp->cmd_usage = usage; + kp->cmd_help = help; + kp->cmd_flags = 0; + kp->cmd_minlen = minlen; + kp->cmd_repeat = repeat; + + return 0; +} + +/* + * kdb_register - Compatibility register function for commands that do + * not need to specify a repeat state. Equivalent to + * kdb_register_repeat with KDB_REPEAT_NONE. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * Returns: + * zero for success, one if a duplicate command. + */ +int kdb_register(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen) +{ + return kdb_register_repeat(cmd, func, usage, help, minlen, + KDB_REPEAT_NONE); +} + +/* + * kdb_unregister - This function is used to unregister a kernel + * debugger command. It is generally called when a module which + * implements kdb commands is unloaded. + * Inputs: + * cmd Command name + * Returns: + * zero for success, one command not registered. + */ +int kdb_unregister(char *cmd) +{ + int i; + kdbtab_t *kp; + + /* + * find the command. + */ + for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kp->cmd_name = NULL; + return 0; + } + } + + /* Couldn't find it. */ + return 1; +} + +/* Initialize the kdb command table. */ +static void __init kdb_inittab(void) +{ + int i; + kdbtab_t *kp; + + for_each_kdbcmd(kp, i) + kp->cmd_name = NULL; + + kdb_register_repeat("md", kdb_md, "<vaddr>", + "Display Memory Contents, also mdWcN, e.g. 
md8c1", 1, + KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", + "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", + "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mds", kdb_md, "<vaddr>", + "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", + "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("go", kdb_go, "[<vaddr>]", + "Continue Execution", 1, KDB_REPEAT_NONE); + kdb_register_repeat("rd", kdb_rd, "", + "Display Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", + "Modify Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ef", kdb_ef, "<vaddr>", + "Display exception frame", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", + "Stack traceback", 1, KDB_REPEAT_NONE); + kdb_register_repeat("btp", kdb_bt, "<pid>", + "Display stack for process <pid>", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", + "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btt", kdb_bt, "<vaddr>", + "Backtrace process given its struct task address", 0, + KDB_REPEAT_NONE); + kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>", + "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("env", kdb_env, "", + "Show environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("set", kdb_set, "", + "Set environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("help", kdb_help, "", + "Display Help Message", 1, KDB_REPEAT_NONE); + kdb_register_repeat("?", kdb_help, "", + "Display Help Message", 0, KDB_REPEAT_NONE); + kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", + "Switch to new cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", + "Display active task list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("pid", kdb_pid, "<pidnum>", + "Switch to another task", 0, KDB_REPEAT_NONE); + kdb_register_repeat("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, KDB_REPEAT_NONE); +#if defined(CONFIG_MODULES) + kdb_register_repeat("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_MAGIC_SYSRQ) + kdb_register_repeat("sr", kdb_sr, "<key>", + "Magic SysRq key", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_PRINTK) + kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, KDB_REPEAT_NONE); +#endif + kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", + "Send a signal to a process", 0, KDB_REPEAT_NONE); + kdb_register_repeat("summary", kdb_summary, "", + "Summarize the system", 4, KDB_REPEAT_NONE); + kdb_register_repeat("per_cpu", kdb_per_cpu, "", + "Display per_cpu variables", 3, KDB_REPEAT_NONE); + kdb_register_repeat("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, KDB_REPEAT_NONE); +} + +/* Execute any commands defined in kdb_cmds. 
*/ +static void __init kdb_cmd_init(void) +{ + int i, diag; + for (i = 0; kdb_cmds[i]; ++i) { + diag = kdb_parse(kdb_cmds[i]); + if (diag) + kdb_printf("kdb command %s failed, kdb diag %d\n", + kdb_cmds[i], diag); + } + if (defcmd_in_progress) { + kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n"); + kdb_parse("endefcmd"); + } +} + +/* Intialize kdb_printf, breakpoint tables and kdb state */ +void __init kdb_init(int lvl) +{ + static int kdb_init_lvl = KDB_NOT_INITIALIZED; + int i; + + if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl) + return; + for (i = kdb_init_lvl; i < lvl; i++) { + switch (i) { + case KDB_NOT_INITIALIZED: + kdb_inittab(); /* Initialize Command Table */ + kdb_initbptab(); /* Initialize Breakpoints */ + break; + case KDB_INIT_EARLY: + kdb_cmd_init(); /* Build kdb_cmds tables */ + break; + } + } + kdb_init_lvl = lvl; +} diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h new file mode 100644 index 00000000000..be775f7e81e --- /dev/null +++ b/kernel/debug/kdb/kdb_private.h @@ -0,0 +1,305 @@ +#ifndef _KDBPRIVATE_H +#define _KDBPRIVATE_H + +/* + * Kernel Debugger Architecture Independent Private Headers + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include <linux/kgdb.h> +#include "../debug_core.h" + +/* Kernel Debugger Error codes. Must not overlap with command codes. */ +#define KDB_NOTFOUND (-1) +#define KDB_ARGCOUNT (-2) +#define KDB_BADWIDTH (-3) +#define KDB_BADRADIX (-4) +#define KDB_NOTENV (-5) +#define KDB_NOENVVALUE (-6) +#define KDB_NOTIMP (-7) +#define KDB_ENVFULL (-8) +#define KDB_ENVBUFFULL (-9) +#define KDB_TOOMANYBPT (-10) +#define KDB_TOOMANYDBREGS (-11) +#define KDB_DUPBPT (-12) +#define KDB_BPTNOTFOUND (-13) +#define KDB_BADMODE (-14) +#define KDB_BADINT (-15) +#define KDB_INVADDRFMT (-16) +#define KDB_BADREG (-17) +#define KDB_BADCPUNUM (-18) +#define KDB_BADLENGTH (-19) +#define KDB_NOBP (-20) +#define KDB_BADADDR (-21) + +/* Kernel Debugger Command codes. Must not overlap with error codes. 
*/ +#define KDB_CMD_GO (-1001) +#define KDB_CMD_CPU (-1002) +#define KDB_CMD_SS (-1003) +#define KDB_CMD_SSB (-1004) +#define KDB_CMD_KGDB (-1005) +#define KDB_CMD_KGDB2 (-1006) + +/* Internal debug flags */ +#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ +#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */ +#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */ +#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */ +#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */ +#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */ +#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */ +#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */ + +#define KDB_DEBUG(flag) (kdb_flags & \ + (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT)) +#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \ + kdb_print_state(text, value) + +#if BITS_PER_LONG == 32 + +#define KDB_PLATFORM_ENV "BYTESPERWORD=4" + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%08lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%08lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%08x" +#define kdb_f_count_fmt "%d" + +#elif BITS_PER_LONG == 64 + +#define KDB_PLATFORM_ENV "BYTESPERWORD=8" + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%016lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%016lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%016x" +#define kdb_f_count_fmt "%ld" + +#endif + +/* + * KDB_MAXBPT describes the total number of breakpoints + * supported by this architecure. + */ +#define KDB_MAXBPT 16 + +/* Maximum number of arguments to a function */ +#define KDB_MAXARGS 16 + +typedef enum { + KDB_REPEAT_NONE = 0, /* Do not repeat this command */ + KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */ + KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */ +} kdb_repeat_t; + +typedef int (*kdb_func_t)(int, const char **); + +/* Symbol table format returned by kallsyms. */ +typedef struct __ksymtab { + unsigned long value; /* Address of symbol */ + const char *mod_name; /* Module containing symbol or + * "kernel" */ + unsigned long mod_start; + unsigned long mod_end; + const char *sec_name; /* Section containing symbol */ + unsigned long sec_start; + unsigned long sec_end; + const char *sym_name; /* Full symbol name, including + * any version */ + unsigned long sym_start; + unsigned long sym_end; + } kdb_symtab_t; +extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_complete(char *prefix_name, int max_len); + +/* Exported Symbols for kernel loadable modules to use. */ +extern int kdb_register(char *, kdb_func_t, char *, char *, short); +extern int kdb_register_repeat(char *, kdb_func_t, char *, char *, + short, kdb_repeat_t); +extern int kdb_unregister(char *); + +extern int kdb_getarea_size(void *, unsigned long, size_t); +extern int kdb_putarea_size(unsigned long, void *, size_t); + +/* + * Like get_user and put_user, kdb_getarea and kdb_putarea take variable + * names, not pointers. The underlying *_size functions take pointers. 
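A small sketch of that distinction; the probe address here is arbitrary (the address of the global jiffies counter, purely for illustration):

        unsigned long addr = (unsigned long)&jiffies;
        unsigned long word;
        char buf[16];
        int diag;

        diag = kdb_getarea(word, addr);         /* variable name, not &word */
        if (!diag)
                diag = kdb_getarea_size(buf, addr, sizeof(buf)); /* pointer form */
        if (diag)
                kdb_printf("0x%lx not readable, diag=%d\n", addr, diag);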
+ */ +#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x))) +#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x))) + +extern int kdb_getphysword(unsigned long *word, + unsigned long addr, size_t size); +extern int kdb_getword(unsigned long *, unsigned long, size_t); +extern int kdb_putword(unsigned long, unsigned long, size_t); + +extern int kdbgetularg(const char *, unsigned long *); +extern char *kdbgetenv(const char *); +extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, + long *, char **); +extern int kdbgetsymval(const char *, kdb_symtab_t *); +extern int kdbnearsym(unsigned long, kdb_symtab_t *); +extern void kdbnearsym_cleanup(void); +extern char *kdb_strdup(const char *str, gfp_t type); +extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); + +/* Routine for debugging the debugger state. */ +extern void kdb_print_state(const char *, int); + +extern int kdb_state; +#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */ +#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */ +#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */ +#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under + * kdb control */ +#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ +#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ +#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, + * DOING_SS is also set */ +#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint + * after one ss, independent of + * DOING_SS */ +#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */ +#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */ +#define KDB_STATE_PAGER 0x00000400 /* pager is available */ +#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching + * back to initial cpu */ +#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ +#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ +#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ +#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been + * adjusted */ +#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */ +#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via + * keyboard on this cpu */ +#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ +#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ +#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ +#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ +#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch + * specific use */ + +#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag) +#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag)) +#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag)) + +extern int kdb_nextline; /* Current number of lines displayed */ + +typedef struct _kdb_bp { + unsigned long bp_addr; /* Address breakpoint is present at */ + unsigned int bp_free:1; /* This entry is available */ + unsigned int bp_enabled:1; /* Breakpoint is active in register */ + unsigned int bp_type:4; /* Uses hardware register */ + unsigned int bp_installed:1; /* Breakpoint is installed */ + unsigned int bp_delay:1; /* Do delayed bp handling */ + unsigned int bp_delayed:1; /* Delayed breakpoint */ + unsigned int bph_length; /* HW break length */ +} kdb_bp_t; + +#ifdef CONFIG_KGDB_KDB +extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; + +/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* 
Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_flags; /* Parsing flags */ + short cmd_minlen; /* Minimum legal # command + * chars required */ + kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ +} kdbtab_t; + +extern int kdb_bt(int, const char **); /* KDB display back trace */ + +/* KDB breakpoint management functions */ +extern void kdb_initbptab(void); +extern void kdb_bp_install(struct pt_regs *); +extern void kdb_bp_remove(void); + +typedef enum { + KDB_DB_BPT, /* Breakpoint */ + KDB_DB_SS, /* Single-step trap */ + KDB_DB_SSB, /* Single step to branch */ + KDB_DB_SSBPT, /* Single step over breakpoint */ + KDB_DB_NOBPT /* Spurious breakpoint */ +} kdb_dbtrap_t; + +extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, + int, kdb_dbtrap_t, struct pt_regs *); + +/* Miscellaneous functions and data areas */ +extern int kdb_grepping_flag; +extern char kdb_grep_string[]; +extern int kdb_grep_leading; +extern int kdb_grep_trailing; +extern char *kdb_cmds[]; +extern void kdb_syslog_data(char *syslog_data[]); +extern unsigned long kdb_task_state_string(const char *); +extern char kdb_task_state_char (const struct task_struct *); +extern unsigned long kdb_task_state(const struct task_struct *p, + unsigned long mask); +extern void kdb_ps_suppressed(void); +extern void kdb_ps1(const struct task_struct *p); +extern void kdb_print_nameval(const char *name, unsigned long val); +extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_meminfo_proc_show(void); +#ifdef CONFIG_KALLSYMS +extern const char *kdb_walk_kallsyms(loff_t *pos); +#else /* ! CONFIG_KALLSYMS */ +static inline const char *kdb_walk_kallsyms(loff_t *pos) +{ + return NULL; +} +#endif /* ! CONFIG_KALLSYMS */ +extern char *kdb_getstr(char *, size_t, char *); + +/* Defines for kdb_symbol_print */ +#define KDB_SP_SPACEB 0x0001 /* Space before string */ +#define KDB_SP_SPACEA 0x0002 /* Space after string */ +#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */ +#define KDB_SP_VALUE 0x0008 /* Print the value of the address */ +#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */ +#define KDB_SP_NEWLINE 0x0020 /* Newline after string */ +#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN) + +#define KDB_TSK(cpu) kgdb_info[cpu].task +#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo + +extern struct task_struct *kdb_curr_task(int); + +#define kdb_task_has_cpu(p) (task_curr(p)) + +/* Simplify coexistence with NPTL */ +#define kdb_do_each_thread(g, p) do_each_thread(g, p) +#define kdb_while_each_thread(g, p) while_each_thread(g, p) + +#define GFP_KDB (in_interrupt() ? 
GFP_ATOMIC : GFP_KERNEL) + +extern void *debug_kmalloc(size_t size, gfp_t flags); +extern void debug_kfree(void *); +extern void debug_kusage(void); + +extern void kdb_set_current_task(struct task_struct *); +extern struct task_struct *kdb_current_task; +#ifdef CONFIG_MODULES +extern struct list_head *kdb_modules; +#endif /* CONFIG_MODULES */ + +extern char kdb_prompt_str[]; + +#define KDB_WORD_SIZE ((int)sizeof(unsigned long)) + +#endif /* CONFIG_KGDB_KDB */ +#endif /* !_KDBPRIVATE_H */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c new file mode 100644 index 00000000000..6b2485dcb05 --- /dev/null +++ b/kernel/debug/kdb/kdb_support.c @@ -0,0 +1,927 @@ +/* + * Kernel Debugger Architecture Independent Support Functions + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net> + */ + +#include <stdarg.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/kallsyms.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/ptrace.h> +#include <linux/module.h> +#include <linux/highmem.h> +#include <linux/hardirq.h> +#include <linux/delay.h> +#include <linux/uaccess.h> +#include <linux/kdb.h> +#include <linux/slab.h> +#include "kdb_private.h" + +/* + * kdbgetsymval - Return the address of the given symbol. + * + * Parameters: + * symname Character string containing symbol name + * symtab Structure to receive results + * Returns: + * 0 Symbol not found, symtab zero filled + * 1 Symbol mapped to module/symbol/section, data in symtab + */ +int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) +{ + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + symtab); + memset(symtab, 0, sizeof(*symtab)); + symtab->sym_start = kallsyms_lookup_name(symname); + if (symtab->sym_start) { + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 1, " + "symtab->sym_start=0x%lx\n", + symtab->sym_start); + return 1; + } + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 0\n"); + return 0; +} +EXPORT_SYMBOL(kdbgetsymval); + +static char *kdb_name_table[100]; /* arbitrary size */ + +/* + * kdbnearsym - Return the name of the symbol with the nearest address + * less than 'addr'. + * + * Parameters: + * addr Address to check for symbol near + * symtab Structure to receive results + * Returns: + * 0 No sections contain this address, symtab zero filled + * 1 Address mapped to module/symbol/section, data in symtab + * Remarks: + * 2.6 kallsyms has a "feature" where it unpacks the name into a + * string. If that string is reused before the caller expects it + * then the caller sees its string change without warning. To + * avoid cluttering up the main kdb code with lots of kdb_strdup, + * tests and kfree calls, kdbnearsym maintains an LRU list of the + * last few unique strings. The list is sized large enough to + * hold active strings, no kdb caller of kdbnearsym makes more + * than ~20 later calls before using a saved value. 
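A usage sketch of the lookup described below; the probe address is arbitrary (an offset into schedule(), chosen only for illustration):

        kdb_symtab_t symtab;
        unsigned long addr = (unsigned long)schedule + 0x10;

        if (kdbnearsym(addr, &symtab))
                kdb_printf("0x%lx is %s+0x%lx [%s]\n", addr, symtab.sym_name,
                           addr - symtab.sym_start, symtab.mod_name);
        else
                kdb_printf("no symbol covers 0x%lx\n", addr);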
+ */ +int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) +{ + int ret = 0; + unsigned long symbolsize = 0; + unsigned long offset = 0; +#define knt1_size 128 /* must be >= kallsyms table size */ + char *knt1 = NULL; + + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + memset(symtab, 0, sizeof(*symtab)); + + if (addr < 4096) + goto out; + knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); + if (!knt1) { + kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n", + addr); + goto out; + } + symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, + (char **)(&symtab->mod_name), knt1); + if (offset > 8*1024*1024) { + symtab->sym_name = NULL; + addr = offset = symbolsize = 0; + } + symtab->sym_start = addr - offset; + symtab->sym_end = symtab->sym_start + symbolsize; + ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; + + if (ret) { + int i; + /* Another 2.6 kallsyms "feature". Sometimes the sym_name is + * set but the buffer passed into kallsyms_lookup is not used, + * so it contains garbage. The caller has to work out which + * buffer needs to be saved. + * + * What was Rusty smoking when he wrote that code? + */ + if (symtab->sym_name != knt1) { + strncpy(knt1, symtab->sym_name, knt1_size); + knt1[knt1_size-1] = '\0'; + } + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i] && + strcmp(kdb_name_table[i], knt1) == 0) + break; + } + if (i >= ARRAY_SIZE(kdb_name_table)) { + debug_kfree(kdb_name_table[0]); + memcpy(kdb_name_table, kdb_name_table+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-1)); + } else { + debug_kfree(knt1); + knt1 = kdb_name_table[i]; + memcpy(kdb_name_table+i, kdb_name_table+i+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-i-1)); + } + i = ARRAY_SIZE(kdb_name_table) - 1; + kdb_name_table[i] = knt1; + symtab->sym_name = kdb_name_table[i]; + knt1 = NULL; + } + + if (symtab->mod_name == NULL) + symtab->mod_name = "kernel"; + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " + "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + symtab->sym_start, symtab->mod_name, symtab->sym_name, + symtab->sym_name); + +out: + debug_kfree(knt1); + return ret; +} + +void kdbnearsym_cleanup(void) +{ + int i; + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i]) { + debug_kfree(kdb_name_table[i]); + kdb_name_table[i] = NULL; + } + } +} + +static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; + +/* + * kallsyms_symbol_complete + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * max_len maximum length that can be returned + * Returns: + * Number of symbols which match the given prefix. + * Notes: + * prefix_name is changed to contain the longest unique prefix that + * starts with this prefix (tab completion). 
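Behaviourally, assuming for the sake of the example that only "printk" and "print_modules" match the prefix:

        char buf[64] = "prin";
        int matches = kallsyms_symbol_complete(buf, sizeof(buf));
        /* buf is now "print" (the longest common prefix) and matches == 2;
         * completing "printk" instead would return 1 and leave buf as is. */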
+ */ +int kallsyms_symbol_complete(char *prefix_name, int max_len) +{ + loff_t pos = 0; + int prefix_len = strlen(prefix_name), prev_len = 0; + int i, number = 0; + const char *name; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strcpy(ks_namebuf, name); + /* Work out the longest name that matches the prefix */ + if (++number == 1) { + prev_len = min_t(int, max_len-1, + strlen(ks_namebuf)); + memcpy(ks_namebuf_prev, ks_namebuf, prev_len); + ks_namebuf_prev[prev_len] = '\0'; + continue; + } + for (i = 0; i < prev_len; i++) { + if (ks_namebuf[i] != ks_namebuf_prev[i]) { + prev_len = i; + ks_namebuf_prev[i] = '\0'; + break; + } + } + } + } + if (prev_len > prefix_len) + memcpy(prefix_name, ks_namebuf_prev, prev_len+1); + return number; +} + +/* + * kallsyms_symbol_next + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * flag 0 means search from the head, 1 means continue search. + * Returns: + * 1 if a symbol matches the given prefix. + * 0 if no string found + */ +int kallsyms_symbol_next(char *prefix_name, int flag) +{ + int prefix_len = strlen(prefix_name); + static loff_t pos; + const char *name; + + if (!flag) + pos = 0; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strncpy(prefix_name, name, strlen(name)+1); + return 1; + } + } + return 0; +} + +/* + * kdb_symbol_print - Standard method for printing a symbol name and offset. + * Inputs: + * addr Address to be printed. + * symtab Address of symbol data, if NULL this routine does its + * own lookup. + * punc Punctuation for string, bit field. + * Remarks: + * The string and its punctuation is only printed if the address + * is inside the kernel, except that the value is always printed + * when requested. + */ +void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, + unsigned int punc) +{ + kdb_symtab_t symtab, *symtab_p2; + if (symtab_p) { + symtab_p2 = (kdb_symtab_t *)symtab_p; + } else { + symtab_p2 = &symtab; + kdbnearsym(addr, symtab_p2); + } + if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE))) + return; + if (punc & KDB_SP_SPACEB) + kdb_printf(" "); + if (punc & KDB_SP_VALUE) + kdb_printf(kdb_machreg_fmt0, addr); + if (symtab_p2->sym_name) { + if (punc & KDB_SP_VALUE) + kdb_printf(" "); + if (punc & KDB_SP_PAREN) + kdb_printf("("); + if (strcmp(symtab_p2->mod_name, "kernel")) + kdb_printf("[%s]", symtab_p2->mod_name); + kdb_printf("%s", symtab_p2->sym_name); + if (addr != symtab_p2->sym_start) + kdb_printf("+0x%lx", addr - symtab_p2->sym_start); + if (punc & KDB_SP_SYMSIZE) + kdb_printf("/0x%lx", + symtab_p2->sym_end - symtab_p2->sym_start); + if (punc & KDB_SP_PAREN) + kdb_printf(")"); + } + if (punc & KDB_SP_SPACEA) + kdb_printf(" "); + if (punc & KDB_SP_NEWLINE) + kdb_printf("\n"); +} + +/* + * kdb_strdup - kdb equivalent of strdup, for disasm code. + * Inputs: + * str The string to duplicate. + * type Flags to kmalloc for the new string. + * Returns: + * Address of the new string, NULL if storage could not be allocated. + * Remarks: + * This is not in lib/string.c because it uses kmalloc which is not + * available when string.o is used in boot loaders. + */ +char *kdb_strdup(const char *str, gfp_t type) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, type); + if (!s) + return NULL; + return strcpy(s, str); +} + +/* + * kdb_getarea_size - Read an area of data. The kdb equivalent of + * copy_from_user, with kdb messages for invalid addresses. 
+ * Inputs: + * res Pointer to the area to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getarea_size(void *res, unsigned long addr, size_t size) +{ + int ret = probe_kernel_read((char *)res, (char *)addr, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_putarea_size - Write an area of data. The kdb equivalent of + * copy_to_user, with kdb messages for invalid addresses. + * Inputs: + * addr Address of the area to write to. + * res Pointer to the area holding the data. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putarea_size(unsigned long addr, void *res, size_t size) +{ + int ret = probe_kernel_read((char *)addr, (char *)res, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_getphys - Read data from a physical address. Validate the + * address is in range, use kmap_atomic() to get data + * similar to kdb_getarea() - but for phys addresses + * Inputs: + * res Pointer to the word to receive the result + * addr Physical address of the area to copy + * size Size of the area + * Returns: + * 0 for success, < 0 for error. + */ +static int kdb_getphys(void *res, unsigned long addr, size_t size) +{ + unsigned long pfn; + void *vaddr; + struct page *page; + + pfn = (addr >> PAGE_SHIFT); + if (!pfn_valid(pfn)) + return 1; + page = pfn_to_page(pfn); + vaddr = kmap_atomic(page, KM_KDB); + memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); + kunmap_atomic(vaddr, KM_KDB); + + return 0; +} + +/* + * kdb_getphysword + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + + switch (size) { + case 1: + diag = kdb_getphys(&w1, addr, sizeof(w1)); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getphys(&w2, addr, sizeof(w2)); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getphys(&w4, addr, sizeof(w4)); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getphys(&w8, addr, sizeof(w8)); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats + * data as numbers. + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. 
+ */ +int kdb_getword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + switch (size) { + case 1: + diag = kdb_getarea(w1, addr); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getarea(w2, addr); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getarea(w4, addr); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getarea(w8, addr); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_putword - Write a binary value. Unlike kdb_putarea, this + * treats data as numbers. + * Inputs: + * addr Address of the area to write to.. + * word The value to set. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putword(unsigned long addr, unsigned long word, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + switch (size) { + case 1: + w1 = word; + diag = kdb_putarea(addr, w1); + break; + case 2: + w2 = word; + diag = kdb_putarea(addr, w2); + break; + case 4: + w4 = word; + diag = kdb_putarea(addr, w4); + break; + case 8: + if (size <= sizeof(word)) { + w8 = word; + diag = kdb_putarea(addr, w8); + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_putword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_task_state_string - Convert a string containing any of the + * letters DRSTCZEUIMA to a mask for the process state field and + * return the value. If no argument is supplied, return the mask + * that corresponds to environment variable PS, DRSTCZEU by + * default. + * Inputs: + * s String to convert + * Returns: + * Mask for process state. + * Notes: + * The mask folds data from several sources into a single long value, so + * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there + * is no overlap between TASK_* and EXIT_* but that may not always be + * true, so EXIT_* bits are shifted left 16 bits before being stored in + * the mask. + */ + +/* unrunnable is < 0 */ +#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1)) +#define RUNNING (1UL << (8*sizeof(unsigned long) - 2)) +#define IDLE (1UL << (8*sizeof(unsigned long) - 3)) +#define DAEMON (1UL << (8*sizeof(unsigned long) - 4)) + +unsigned long kdb_task_state_string(const char *s) +{ + long res = 0; + if (!s) { + s = kdbgetenv("PS"); + if (!s) + s = "DRSTCZEU"; /* default value for ps */ + } + while (*s) { + switch (*s) { + case 'D': + res |= TASK_UNINTERRUPTIBLE; + break; + case 'R': + res |= RUNNING; + break; + case 'S': + res |= TASK_INTERRUPTIBLE; + break; + case 'T': + res |= TASK_STOPPED; + break; + case 'C': + res |= TASK_TRACED; + break; + case 'Z': + res |= EXIT_ZOMBIE << 16; + break; + case 'E': + res |= EXIT_DEAD << 16; + break; + case 'U': + res |= UNRUNNABLE; + break; + case 'I': + res |= IDLE; + break; + case 'M': + res |= DAEMON; + break; + case 'A': + res = ~0UL; + break; + default: + kdb_printf("%s: unknown flag '%c' ignored\n", + __func__, *s); + break; + } + ++s; + } + return res; +} + +/* + * kdb_task_state_char - Return the character that represents the task state. + * Inputs: + * p struct task for the process + * Returns: + * One character to represent the task state. 
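A short sketch of how the state-mask helpers fit together; the flag string is an example, and the loop mirrors the one used by the 'ps' command:

        struct task_struct *g, *p;
        unsigned long mask = kdb_task_state_string("DRS");

        /* mask selects uninterruptible (D), running (R) and sleeping (S)
         * tasks; kdb_task_state(p, mask) is non-zero for a match. */
        kdb_do_each_thread(g, p) {
                if (kdb_task_state(p, mask))
                        kdb_ps1(p);
        } kdb_while_each_thread(g, p);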
+ */ +char kdb_task_state_char (const struct task_struct *p) +{ + int cpu; + char state; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return 'E'; + + cpu = kdb_process_cpu(p); + state = (p->state == 0) ? 'R' : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED) ? 'T' : + (p->state & TASK_TRACED) ? 'C' : + (p->exit_state & EXIT_ZOMBIE) ? 'Z' : + (p->exit_state & EXIT_DEAD) ? 'E' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; + if (p->pid == 0) { + /* Idle task. Is it really idle, apart from the kdb + * interrupt? */ + if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { + if (cpu != kdb_initial_cpu) + state = 'I'; /* idle task */ + } + } else if (!p->mm && state == 'S') { + state = 'M'; /* sleeping system daemon */ + } + return state; +} + +/* + * kdb_task_state - Return true if a process has the desired state + * given by the mask. + * Inputs: + * p struct task for the process + * mask mask from kdb_task_state_string to select processes + * Returns: + * True if the process matches at least one criteria defined by the mask. + */ +unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) +{ + char state[] = { kdb_task_state_char(p), '\0' }; + return (mask & kdb_task_state_string(state)) != 0; +} + +/* + * kdb_print_nameval - Print a name and its value, converting the + * value to a symbol lookup if possible. + * Inputs: + * name field name to print + * val value of field + */ +void kdb_print_nameval(const char *name, unsigned long val) +{ + kdb_symtab_t symtab; + kdb_printf(" %-11.11s ", name); + if (kdbnearsym(val, &symtab)) + kdb_symbol_print(val, &symtab, + KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE); + else + kdb_printf("0x%lx\n", val); +} + +/* Last ditch allocator for debugging, so we can still debug even when + * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned + * for space usage, not for speed. One smallish memory pool, the free + * chain is always in ascending address order to allow coalescing, + * allocations are done in brute force best fit. + */ + +struct debug_alloc_header { + u32 next; /* offset of next header from start of pool */ + u32 size; + void *caller; +}; + +/* The memory returned by this allocator must be aligned, which means + * so must the header size. Do not assume that sizeof(struct + * debug_alloc_header) is a multiple of the alignment, explicitly + * calculate the overhead of this header, including the alignment. + * The rest of this code must not use sizeof() on any header or + * pointer to a header. + */ +#define dah_align 8 +#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) + +static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ +static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; +static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; + +/* Locking is awkward. The debug code is called from all contexts, + * including non maskable interrupts. A normal spinlock is not safe + * in NMI context. Try to get the debug allocator lock, if it cannot + * be obtained after a second then give up. If the lock could not be + * previously obtained on this cpu then only try once. + * + * sparse has no annotation for "this function _sometimes_ acquires a + * lock", so fudge the acquire/release notation. 
+ */ +static DEFINE_SPINLOCK(dap_lock); +static int get_dap_lock(void) + __acquires(dap_lock) +{ + static int dap_locked = -1; + int count; + if (dap_locked == smp_processor_id()) + count = 1; + else + count = 1000; + while (1) { + if (spin_trylock(&dap_lock)) { + dap_locked = -1; + return 1; + } + if (!count--) + break; + udelay(1000); + } + dap_locked = smp_processor_id(); + __acquire(dap_lock); + return 0; +} + +void *debug_kmalloc(size_t size, gfp_t flags) +{ + unsigned int rem, h_offset; + struct debug_alloc_header *best, *bestprev, *prev, *h; + void *p = NULL; + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return NULL; + } + h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first_call) { + h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; + dah_first_call = 0; + } + size = ALIGN(size, dah_align); + prev = best = bestprev = NULL; + while (1) { + if (h->size >= size && (!best || h->size < best->size)) { + best = h; + bestprev = prev; + if (h->size == size) + break; + } + if (!h->next) + break; + prev = h; + h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); + } + if (!best) + goto out; + rem = best->size - size; + /* The pool must always contain at least one header */ + if (best->next == 0 && bestprev == NULL && rem < dah_overhead) + goto out; + if (rem >= dah_overhead) { + best->size = size; + h_offset = ((char *)best - debug_alloc_pool) + + dah_overhead + best->size; + h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); + h->size = rem - dah_overhead; + h->next = best->next; + } else + h_offset = best->next; + best->caller = __builtin_return_address(0); + dah_used += best->size; + dah_used_max = max(dah_used, dah_used_max); + if (bestprev) + bestprev->next = h_offset; + else + dah_first = h_offset; + p = (char *)best + dah_overhead; + memset(p, POISON_INUSE, best->size - 1); + *((char *)p + best->size - 1) = POISON_END; +out: + spin_unlock(&dap_lock); + return p; +} + +void debug_kfree(void *p) +{ + struct debug_alloc_header *h; + unsigned int h_offset; + if (!p) + return; + if ((char *)p < debug_alloc_pool || + (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { + kfree(p); + return; + } + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; /* memory leak, cannot be helped */ + } + h = (struct debug_alloc_header *)((char *)p - dah_overhead); + memset(p, POISON_FREE, h->size - 1); + *((char *)p + h->size - 1) = POISON_END; + h->caller = NULL; + dah_used -= h->size; + h_offset = (char *)h - debug_alloc_pool; + if (h_offset < dah_first) { + h->next = dah_first; + dah_first = h_offset; + } else { + struct debug_alloc_header *prev; + unsigned int prev_offset; + prev = (struct debug_alloc_header *)(debug_alloc_pool + + dah_first); + while (1) { + if (!prev->next || prev->next > h_offset) + break; + prev = (struct debug_alloc_header *) + (debug_alloc_pool + prev->next); + } + prev_offset = (char *)prev - debug_alloc_pool; + if (prev_offset + dah_overhead + prev->size == h_offset) { + prev->size += dah_overhead + h->size; + memset(h, POISON_FREE, dah_overhead - 1); + *((char *)h + dah_overhead - 1) = POISON_END; + h = prev; + h_offset = prev_offset; + } else { + h->next = prev->next; + prev->next = h_offset; + } + } + if (h_offset + dah_overhead + h->size == h->next) { + struct debug_alloc_header *next; + next = (struct debug_alloc_header *) + (debug_alloc_pool + h->next); + h->size += dah_overhead + next->size; + h->next = next->next; + memset(next, 
POISON_FREE, dah_overhead - 1); + *((char *)next + dah_overhead - 1) = POISON_END; + } + spin_unlock(&dap_lock); +} + +void debug_kusage(void) +{ + struct debug_alloc_header *h_free, *h_used; +#ifdef CONFIG_IA64 + /* FIXME: using dah for ia64 unwind always results in a memory leak. + * Fix that memory leak first, then set debug_kusage_one_time = 1 for + * all architectures. + */ + static int debug_kusage_one_time; +#else + static int debug_kusage_one_time = 1; +#endif + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; + } + h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first == 0 && + (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || + dah_first_call)) + goto out; + if (!debug_kusage_one_time) + goto out; + debug_kusage_one_time = 0; + kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n", + __func__, dah_first); + if (dah_first) { + h_used = (struct debug_alloc_header *)debug_alloc_pool; + kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + h_used->size); + } + do { + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); + h_free = (struct debug_alloc_header *) + (debug_alloc_pool + h_free->next); + } while (h_free->next); + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + if ((char *)h_used - debug_alloc_pool != + sizeof(debug_alloc_pool_aligned)) + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); +out: + spin_unlock(&dap_lock); +} + +/* Maintain a small stack of kdb_flags to allow recursion without disturbing + * the global kdb state. + */ + +static int kdb_flags_stack[4], kdb_flags_index; + +void kdb_save_flags(void) +{ + BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack)); + kdb_flags_stack[kdb_flags_index++] = kdb_flags; +} + +void kdb_restore_flags(void) +{ + BUG_ON(kdb_flags_index <= 0); + kdb_flags = kdb_flags_stack[--kdb_flags_index]; +} diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 00000000000..7bfae887f21 --- /dev/null +++ b/kernel/early_res.c @@ -0,0 +1,590 @@ +/* + * early_res, could be used to replace bootmem + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/early_res.h> +#include <linux/slab.h> +#include <linux/kmemleak.h> + +/* + * Early reserved memory areas. + */ +/* + * need to make sure this one is bigger enough before + * find_fw_memmap_area could be used + */ +#define MAX_EARLY_RES_X 32 + +struct early_res { + u64 start, end; + char name[15]; + char overlap_ok; +}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; + +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. 
+ */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < max_early_res && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; + early_res_count--; +} + +static void __init drop_range_partial(int i, u64 start, u64 end) +{ + u64 common_start, common_end; + u64 old_start, old_end; + + old_start = early_res[i].start; + old_end = early_res[i].end; + common_start = max(old_start, start); + common_end = min(old_end, end); + + /* no overlap ? */ + if (common_start >= common_end) + return; + + if (old_start < common_start) { + /* make head segment */ + early_res[i].end = common_start; + if (old_end > common_end) { + char name[15]; + + /* + * Save a local copy of the name, since the + * early_res array could get resized inside + * reserve_early_without_check() -> + * __check_and_double_early_res(), which would + * make the current name pointer invalid. + */ + strncpy(name, early_res[i].name, + sizeof(early_res[i].name) - 1); + /* add another for left over on tail */ + reserve_early_without_check(common_end, old_end, name); + } + return; + } else { + if (old_end > common_end) { + /* reuse the entry for tail left */ + early_res[i].start = common_end; + return; + } + /* all covered */ + drop_range(i); + } +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. + */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[15]; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); + if (i >= max_early_res) + panic("Too many early reservations"); + r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name ? 
name : "", r->start, + r->end - 1, r->name); + r->start = start; + r->end = end; + r->overlap_ok = overlap_ok; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) +{ + u64 start, end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + mem = -1ULL; + size = sizeof(struct early_res) * max_early_res * 2; + if (early_res == early_res_x) + start = 0; + else + start = early_res[0].end; + end = ex_start; + if (start + size < end) + mem = find_fw_memmap_area(start, end, size, + sizeof(struct early_res)); + if (mem == -1ULL) { + start = ex_end; + end = get_max_mapped(); + if (start + size < end) + mem = find_fw_memmap_area(start, end, size, + sizeof(struct early_res)); + } + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. 
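A usage sketch of the reservation lifecycle; the range and label are made up (real callers reserve things such as the kernel image, initrd or firmware areas during early boot):

        /* Keep an early firmware area away from later early allocations. */
        reserve_early(0x000f0000ULL, 0x00100000ULL, "example table");

        /* ... later, once the contents have been copied somewhere safe ... */
        free_early(0x000f0000ULL, 0x00100000ULL);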
+ */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + __check_and_double_early_res(start, end); + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(start, end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + kmemleak_free_part(__va(start), end - start); + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= max_early_res || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +void __init free_early_partial(u64 start, u64 end) +{ + struct early_res *r; + int i; + + kmemleak_free_part(__va(start), end - start); + + if (start == end) + return; + + if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) + return; + +try_next: + i = find_overlapped_early(start, end); + if (i >= max_early_res) + return; + + r = &early_res[i]; + /* hole ? */ + if (r->end >= end && r->start <= start) { + drop_range_partial(i, start, end); + return; + } + + drop_range_partial(i, start, end); + goto try_next; +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#define DEBUG_PRINT_EARLY_RES 1 + +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; + end = get_max_mapped(); +#ifdef MAX_DMA32_PFN + if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? 
*/ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; +} +#endif + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < max_early_res && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < max_early_res && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. 
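bad_addr() above and find_early_area() just below implement a simple "bump the candidate past every reservation it collides with" loop. The stand-alone sketch below mirrors that idea with invented reservations, size and alignment; it is a model of the algorithm, not the kernel code itself.

#include <stdio.h>
#include <stdint.h>

struct res {
        uint64_t start, end;            /* invented reserved ranges */
};

static uint64_t round_up64(uint64_t x, uint64_t align)
{
        return (x + align - 1) & ~(align - 1);  /* align: power of two */
}

int main(void)
{
        struct res reserved[] = { { 0x1000, 0x3000 }, { 0x3800, 0x4000 } };
        uint64_t addr = 0x0800, size = 0x1000, align = 0x800;
        int i, changed;

        do {            /* keep skipping until nothing overlaps the candidate */
                changed = 0;
                for (i = 0; i < 2; i++) {
                        if (addr < reserved[i].end &&
                            addr + size > reserved[i].start) {
                                addr = round_up64(reserved[i].end, align);
                                changed = 1;
                        }
                }
        } while (changed);

        printf("free area found at %#llx\n", (unsigned long long)addr);
        return 0;
}

Running it prints "free area found at 0x4000", i.e. the first aligned address clear of both reservations.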
+ * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + +u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return -1ULL; +} diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 00000000000..ff915efef66 --- /dev/null +++ b/kernel/elfcore.c @@ -0,0 +1,28 @@ +#include <linux/elf.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf_Half __weak elf_core_extra_phdrs(void) +{ + return 0; +} + +int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + return 1; +} + +int __weak elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + return 1; +} + +size_t __weak elf_core_extra_data_size(void) +{ + return 0; +} diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452cadde..0dbeae37422 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -134,23 +134,14 @@ unregister: return 0; } -int -__set_personality(u_long personality) +int __set_personality(unsigned int personality) { - struct exec_domain *ep, *oep; - - ep = lookup_exec_domain(personality); - if (ep == current_thread_info()->exec_domain) { - current->personality = personality; - module_put(ep->module); - return 0; - } + struct exec_domain *oep = current_thread_info()->exec_domain; + current_thread_info()->exec_domain = lookup_exec_domain(personality); current->personality = personality; - oep = current_thread_info()->exec_domain; - current_thread_info()->exec_domain = ep; - module_put(oep->module); + return 0; } @@ -188,17 +179,14 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) 
+SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; - if (personality != 0xffffffff) { + if (personality != 0xffffffff) set_personality(personality); - if (current->personality != personality) - return -EINVAL; - } - return (long)old; + return old; } diff --git a/kernel/exit.c b/kernel/exit.c index 546774a31a6..03120229db2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,15 +55,14 @@ #include <asm/unistd.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> -#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); -static void __unhash_process(struct task_struct *p) +static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); - if (thread_group_leader(p)) { + if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -80,23 +79,26 @@ static void __unhash_process(struct task_struct *p) static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; + bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; + struct tty_struct *uninitialized_var(tty); - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); - - sighand = rcu_dereference(tsk->sighand); + sighand = rcu_dereference_check(tsk->sighand, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) + if (group_dead) { posix_cpu_timers_exit_group(tsk); - else { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) + if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exit_task); if (tsk == sig->curr_target) @@ -122,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk) sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - sig = NULL; /* Marker for below. */ } - __unhash_process(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); - - tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (sig) { + if (group_dead) { flush_sigqueue(&sig->shared_pending); - taskstats_tgid_free(sig); - /* - * Make sure ->signal can't go away under rq->lock, - * see account_group_exec_runtime(). - */ - task_rq_unlock_wait(tsk); - __cleanup_signal(sig); + tty_kref_put(tty); } } @@ -170,8 +164,10 @@ void release_task(struct task_struct * p) repeat: tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials */ + * can't be modifying its own credentials. But shut RCU-lockdep up */ + rcu_read_lock(); atomic_dec(&__task_cred(p)->user->processes); + rcu_read_unlock(); proc_flush_task(p); @@ -473,9 +469,11 @@ static void close_files(struct files_struct * files) /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. + * files structure. But use RCU to shut RCU-lockdep up. 
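Several hunks in this file add RCU-lockdep annotations of the same shape: a pointer that may be dereferenced either inside rcu_read_lock() or while holding some other lock is read through rcu_dereference_check() with both conditions listed, which is what keeps CONFIG_PROVE_RCU quiet. A minimal sketch of that pattern, using an invented pointer and lock:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct thing {
        int value;
};

static struct thing *my_global;         /* invented, RCU-protected pointer */
static DEFINE_SPINLOCK(my_lock);        /* invented lock held by updaters */

static int read_value(void)
{
        struct thing *t;
        int v;

        rcu_read_lock();
        /* legal under rcu_read_lock() or while holding my_lock */
        t = rcu_dereference_check(my_global,
                                  rcu_read_lock_held() ||
                                  lockdep_is_held(&my_lock));
        v = t ? t->value : -1;
        rcu_read_unlock();
        return v;
}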
*/ + rcu_read_lock(); fdt = files_fdtable(files); + rcu_read_unlock(); for (;;) { unsigned long set; i = j * __NFDBITS; @@ -521,10 +519,12 @@ void put_files_struct(struct files_struct *files) * at the end of the RCU grace period. Otherwise, * you can free files immediately. */ + rcu_read_lock(); fdt = files_fdtable(files); if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); free_fdtable(fdt); + rcu_read_unlock(); } } @@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(dead_children); - exit_ptrace(father); - write_lock_irq(&tasklist_lock); + /* + * Note that exit_ptrace() and find_new_reaper() might + * drop tasklist_lock and reacquire it. + */ + exit_ptrace(father); reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { @@ -849,12 +852,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; - /* mt-exec, de_thread() is waiting for us */ - if (thread_group_leader(tsk) && - tsk->signal->group_exit_task && - tsk->signal->notify_count < 0) + /* mt-exec, de_thread() is waiting for group leader */ + if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); - write_unlock_irq(&tasklist_lock); tracehook_report_death(tsk, signal, cookie, group_dead); @@ -944,7 +944,9 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -993,8 +995,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) @@ -1180,7 +1184,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (unlikely(wo->wo_flags & WNOWAIT)) { int exit_code = p->exit_code; - int why, status; + int why; get_task_struct(p); read_unlock(&tasklist_lock); @@ -1382,8 +1386,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - /* don't need the RCU readlock here as we're holding a spinlock */ - uid = __task_cred(p)->uid; + uid = task_uid(p); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1456,7 +1459,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = __task_cred(p)->uid; + uid = task_uid(p); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc..c445f8cc408 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +#ifdef CONFIG_PROVE_RCU +int lockdep_tasklist_lock_is_held(void) +{ + return lockdep_is_held(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); +#endif /* #ifdef CONFIG_PROVE_RCU */ + int nr_processes(void) { int cpu; @@ -157,6 +165,18 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static inline void free_signal_struct(struct signal_struct *sig) +{ + taskstats_tgid_free(sig); + kmem_cache_free(signal_cachep, sig); +} + 
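free_signal_struct() above and put_signal_struct() just below form the usual release/put pair of a plain atomic reference count. For comparison only, the same shape written with the kernel's kref helper looks like the sketch below; the struct and function names are invented, and this is not how signal_struct itself is implemented.

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct shared_blob {                    /* invented stand-in for a shared object */
        struct kref ref;
        int payload;
};

static void blob_release(struct kref *kref)
{
        struct shared_blob *b = container_of(kref, struct shared_blob, ref);

        kfree(b);
}

static struct shared_blob *blob_alloc(void)
{
        struct shared_blob *b = kzalloc(sizeof(*b), GFP_KERNEL);

        if (b)
                kref_init(&b->ref);     /* reference count starts at 1 */
        return b;
}

static void blob_put(struct shared_blob *b)
{
        kref_put(&b->ref, blob_release);        /* frees on the last put */
}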
+static inline void put_signal_struct(struct signal_struct *sig) +{ + if (atomic_dec_and_test(&sig->sigcnt)) + free_signal_struct(sig); +} + void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -165,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); + put_signal_struct(tsk->signal); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -279,7 +300,7 @@ out: #ifdef CONFIG_MMU static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; @@ -307,6 +328,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; + prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -328,15 +350,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); - tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; - tmp->vm_next = NULL; - anon_vma_link(tmp); + if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -369,6 +393,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ *pprev = tmp; pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; @@ -391,6 +417,8 @@ out: flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_anon_vma_fork: + mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: @@ -454,8 +482,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; @@ -728,13 +755,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - write_lock(&fs->lock); + spin_lock(&fs->lock); if (fs->in_exec) { - write_unlock(&fs->lock); + spin_unlock(&fs->lock); return -EAGAIN; } fs->users++; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); return 0; } tsk->fs = copy_fs_struct(fs); @@ -824,23 +851,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { + unsigned long cpu_limit; + /* Thread group counters. */ thread_group_cputime_init(sig); - /* Expiration times and increments. */ - sig->it[CPUCLOCK_PROF].expires = cputime_zero; - sig->it[CPUCLOCK_PROF].incr = cputime_zero; - sig->it[CPUCLOCK_VIRT].expires = cputime_zero; - sig->it[CPUCLOCK_VIRT].incr = cputime_zero; - - /* Cached expiration times. 
*/ - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; - - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (cpu_limit != RLIM_INFINITY) { + sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; } @@ -857,73 +875,43 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_THREAD) return 0; - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + sig->nr_threads = 1; atomic_set(&sig->live, 1); + atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - sig->flags = 0; if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; - sig->group_exit_code = 0; - sig->group_exit_task = NULL; - sig->group_stop_count = 0; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = NULL; - sig->tty = NULL; - - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; - sig->gtime = cputime_zero; - sig->cgtime = cputime_zero; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - sig->prev_utime = sig->prev_stime = cputime_zero; -#endif - sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; - sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; - sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->maxrss = sig->cmaxrss = 0; - task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - taskstats_tgid_init(sig); - task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); - tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; return 0; } -void __cleanup_signal(struct signal_struct *sig) -{ - thread_group_cputime_free(sig); - tty_kref_put(sig->tty); - kmem_cache_free(signal_cachep, sig); -} - static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; @@ -1033,7 +1021,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= - p->signal->rlim[RLIMIT_NPROC].rlim_cur) { + task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; @@ -1075,6 +1063,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif +#if defined(SPLIT_RSS_COUNTING) + memset(&p->rss_stat, 0, sizeof(p->rss_stat)); +#endif p->default_timer_slack_ns = current->timer_slack_ns; @@ -1132,10 +1123,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.memcg = NULL; #endif - p->bts = NULL; - - p->stack_start = 
stack_start; - /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); @@ -1241,21 +1228,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. - */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; @@ -1284,8 +1256,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); + current->signal->nr_threads++; atomic_inc(¤t->signal->live); + atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } @@ -1298,7 +1271,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); @@ -1331,7 +1303,7 @@ bad_fork_cleanup_mm: mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - __cleanup_signal(p->signal); + free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: @@ -1366,6 +1338,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re return regs; } +static inline void init_idle_pids(struct pid_link *links) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&links[type].node); /* not really needed */ + links[type].pid = &init_struct_pid; + } +} + struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; @@ -1373,8 +1355,10 @@ struct task_struct * __cpuinit fork_idle(int cpu) task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid, 0); - if (!IS_ERR(task)) + if (!IS_ERR(task)) { + init_idle_pids(task->pids); init_idle(task, cpu); + } return task; } @@ -1546,14 +1530,6 @@ static void check_unshare_flags(unsigned long *flags_ptr) *flags_ptr |= CLONE_SIGHAND; /* - * If unsharing signal handlers and the task was created - * using CLONE_THREAD, then must unshare the thread - */ - if ((*flags_ptr & CLONE_SIGHAND) && - (atomic_read(¤t->signal->count) > 1)) - *flags_ptr |= CLONE_THREAD; - - /* * If unsharing namespace, must also unshare filesystem information. 
*/ if (*flags_ptr & CLONE_NEWNS) @@ -1703,13 +1679,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; - write_lock(&fs->lock); + spin_lock(&fs->lock); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); } if (new_mm) { diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9..6a3a5fa1526 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -530,8 +521,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -547,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, @@ -758,6 +766,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. 
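The checks added here lean on the PI-futex userspace protocol: the futex word holds the owner's TID (plus the FUTEX_WAITERS and FUTEX_OWNER_DIED bits), which is what lets the kernel notice a value that disagrees with pi_state->owner. A minimal userspace sketch of that protocol, with error handling omitted and not taken from any real locking library:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static atomic_uint futex_word;          /* 0 means unlocked */

static void pi_lock(void)
{
        unsigned int zero = 0;
        unsigned int tid = (unsigned int)syscall(SYS_gettid);

        /* uncontended fast path: 0 -> our TID, no syscall needed */
        if (atomic_compare_exchange_strong(&futex_word, &zero, tid))
                return;
        /* contended: the kernel queues us and fixes up the owner TID */
        syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(void)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);

        /* uncontended fast path: our TID -> 0 */
        if (atomic_compare_exchange_strong(&futex_word, &tid, 0))
                return;
        /* waiters exist: let the kernel hand the lock over */
        syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}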
+ */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); @@ -1971,7 +1986,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 235716556bf..d49afb2395e 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct task_struct *p; ret = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; @@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->compat_robust_list; - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (put_user(sizeof(*head), len_ptr)) @@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, return put_user(ptr_to_compat(head), head_ptr); err_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f88a7a..f83972b1656 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -33,10 +33,11 @@ * @children: child nodes * @all: list head for list of all nodes * @parent: parent node - * @info: associated profiling data structure if not a directory - * @ghost: when an object file containing profiling data is unloaded we keep a - * copy of the profiling data here to allow collecting coverage data - * for cleanup code. Such a node is called a "ghost". + * @loaded_info: array of pointers to profiling data sets for loaded object + * files. + * @num_loaded: number of profiling data sets for loaded object files. + * @unloaded_info: accumulated copy of profiling data sets for unloaded + * object files. Used only when gcov_persist=1. * @dentry: main debugfs entry, either a directory or data file * @links: associated symbolic links * @name: data file basename @@ -51,10 +52,11 @@ struct gcov_node { struct list_head children; struct list_head all; struct gcov_node *parent; - struct gcov_info *info; - struct gcov_info *ghost; + struct gcov_info **loaded_info; + struct gcov_info *unloaded_info; struct dentry *dentry; struct dentry **links; + int num_loaded; char name[0]; }; @@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { }; /* - * Return the profiling data set for a given node. This can either be the - * original profiling data structure or a duplicate (also called "ghost") - * in case the associated object file has been unloaded. + * Return a profiling data set associated with the given node. This is + * either a data set for a loaded object file or a data set copy in case + * all associated object files have been unloaded. */ static struct gcov_info *get_node_info(struct gcov_node *node) { - if (node->info) - return node->info; + if (node->num_loaded > 0) + return node->loaded_info[0]; - return node->ghost; + return node->unloaded_info; +} + +/* + * Return a newly allocated profiling data set which contains the sum of + * all profiling data associated with the given node. 
+ */ +static struct gcov_info *get_accumulated_info(struct gcov_node *node) +{ + struct gcov_info *info; + int i = 0; + + if (node->unloaded_info) + info = gcov_info_dup(node->unloaded_info); + else + info = gcov_info_dup(node->loaded_info[i++]); + if (!info) + return NULL; + for (; i < node->num_loaded; i++) + gcov_info_add(info, node->loaded_info[i]); + + return info; } /* @@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) mutex_lock(&node_lock); /* * Read from a profiling data copy to minimize reference tracking - * complexity and concurrent access. + * complexity and concurrent access and to keep accumulating multiple + * profiling data sets associated with one node simple. */ - info = gcov_info_dup(get_node_info(node)); + info = get_accumulated_info(node); if (!info) goto out_unlock; iter = gcov_iter_new(info); @@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) return NULL; } +/* + * Reset all profiling data associated with the specified node. + */ +static void reset_node(struct gcov_node *node) +{ + int i; + + if (node->unloaded_info) + gcov_info_reset(node->unloaded_info); + for (i = 0; i < node->num_loaded; i++) + gcov_info_reset(node->loaded_info[i]); +} + static void remove_node(struct gcov_node *node); /* * write() implementation for gcov data files. Reset profiling data for the - * associated file. If the object file has been unloaded (i.e. this is - * a "ghost" node), remove the debug fs node as well. + * corresponding file. If all associated object files have been unloaded, + * remove the debug fs node as well. */ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, node = get_node_by_name(info->filename); if (node) { /* Reset counts or remove node for unloaded modules. */ - if (node->ghost) + if (node->num_loaded == 0) remove_node(node); else - gcov_info_reset(node->info); + reset_node(node); } /* Reset counts for open file. */ gcov_info_reset(info); @@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, INIT_LIST_HEAD(&node->list); INIT_LIST_HEAD(&node->children); INIT_LIST_HEAD(&node->all); - node->info = info; + if (node->loaded_info) { + node->loaded_info[0] = info; + node->num_loaded = 1; + } node->parent = parent; if (name) strcpy(node->name, name); @@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, struct gcov_node *node; node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); - if (!node) { - pr_warning("out of memory\n"); - return NULL; + if (!node) + goto err_nomem; + if (info) { + node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), + GFP_KERNEL); + if (!node->loaded_info) + goto err_nomem; } init_node(node, info, name, parent); /* Differentiate between gcov data file nodes and directory nodes. */ @@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, list_add(&node->all, &all_head); return node; + +err_nomem: + kfree(node); + pr_warning("out of memory\n"); + return NULL; } /* Remove symbolic links associated with node. 
*/ @@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) list_del(&node->all); debugfs_remove(node->dentry); remove_links(node); - if (node->ghost) - gcov_info_free(node->ghost); + kfree(node->loaded_info); + if (node->unloaded_info) + gcov_info_free(node->unloaded_info); kfree(node); } @@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, /* * write() implementation for reset file. Reset all profiling data to zero - * and remove ghost nodes. + * and remove nodes for which all associated object files are unloaded. */ static ssize_t reset_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, mutex_lock(&node_lock); restart: list_for_each_entry(node, &all_head, all) { - if (node->info) - gcov_info_reset(node->info); + if (node->num_loaded > 0) + reset_node(node); else if (list_empty(&node->children)) { remove_node(node); /* Several nodes may have gone - restart loop. */ @@ -564,37 +614,115 @@ err_remove: } /* - * The profiling data set associated with this node is being unloaded. Store a - * copy of the profiling data and turn this node into a "ghost". + * Associate a profiling data set with an existing node. Needs to be called + * with node_lock held. */ -static int ghost_node(struct gcov_node *node) +static void add_info(struct gcov_node *node, struct gcov_info *info) { - node->ghost = gcov_info_dup(node->info); - if (!node->ghost) { - pr_warning("could not save data for '%s' (out of memory)\n", - node->info->filename); - return -ENOMEM; + struct gcov_info **loaded_info; + int num = node->num_loaded; + + /* + * Prepare new array. This is done first to simplify cleanup in + * case the new data set is incompatible, the node only contains + * unloaded data sets and there's not enough memory for the array. + */ + loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); + if (!loaded_info) { + pr_warning("could not add '%s' (out of memory)\n", + info->filename); + return; + } + memcpy(loaded_info, node->loaded_info, + num * sizeof(struct gcov_info *)); + loaded_info[num] = info; + /* Check if the new data set is compatible. */ + if (num == 0) { + /* + * A module was unloaded, modified and reloaded. The new + * data set replaces the copy of the last one. + */ + if (!gcov_info_is_compatible(node->unloaded_info, info)) { + pr_warning("discarding saved data for %s " + "(incompatible version)\n", info->filename); + gcov_info_free(node->unloaded_info); + node->unloaded_info = NULL; + } + } else { + /* + * Two different versions of the same object file are loaded. + * The initial one takes precedence. + */ + if (!gcov_info_is_compatible(node->loaded_info[0], info)) { + pr_warning("could not add '%s' (incompatible " + "version)\n", info->filename); + kfree(loaded_info); + return; + } } - node->info = NULL; + /* Overwrite previous array. */ + kfree(node->loaded_info); + node->loaded_info = loaded_info; + node->num_loaded = num + 1; +} - return 0; +/* + * Return the index of a profiling data set associated with a node. + */ +static int get_info_index(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + for (i = 0; i < node->num_loaded; i++) { + if (node->loaded_info[i] == info) + return i; + } + return -ENOENT; } /* - * Profiling data for this node has been loaded again. Add profiling data - * from previous instantiation and turn this node into a regular node. 
+ * Save the data of a profiling data set which is being unloaded. */ -static void revive_node(struct gcov_node *node, struct gcov_info *info) +static void save_info(struct gcov_node *node, struct gcov_info *info) { - if (gcov_info_is_compatible(node->ghost, info)) - gcov_info_add(info, node->ghost); + if (node->unloaded_info) + gcov_info_add(node->unloaded_info, info); else { - pr_warning("discarding saved data for '%s' (version changed)\n", + node->unloaded_info = gcov_info_dup(info); + if (!node->unloaded_info) { + pr_warning("could not save data for '%s' " + "(out of memory)\n", info->filename); + } + } +} + +/* + * Disassociate a profiling data set from a node. Needs to be called with + * node_lock held. + */ +static void remove_info(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + i = get_info_index(node, info); + if (i < 0) { + pr_warning("could not remove '%s' (not found)\n", info->filename); + return; } - gcov_info_free(node->ghost); - node->ghost = NULL; - node->info = info; + if (gcov_persist) + save_info(node, info); + /* Shrink array. */ + node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; + node->num_loaded--; + if (node->num_loaded > 0) + return; + /* Last loaded data set was removed. */ + kfree(node->loaded_info); + node->loaded_info = NULL; + node->num_loaded = 0; + if (!node->unloaded_info) + remove_node(node); } /* @@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) node = get_node_by_name(info->filename); switch (action) { case GCOV_ADD: - /* Add new node or revive ghost. */ - if (!node) { + if (node) + add_info(node, info); + else add_node(info); - break; - } - if (gcov_persist) - revive_node(node, info); - else { - pr_warning("could not add '%s' (already exists)\n", - info->filename); - } break; case GCOV_REMOVE: - /* Remove node or turn into ghost. 
*/ - if (!node) { + if (node) + remove_info(node, info); + else { pr_warning("could not remove '%s' (not found)\n", info->filename); - break; } - if (gcov_persist) { - if (!ghost_node(node)) - break; - } - remove_node(node); break; } mutex_unlock(&node_lock); diff --git a/kernel/groups.c b/kernel/groups.c index 2b45b2ee396..253dc0f35cf 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) + if (grp > GROUP_AT(group_info, mid)) left = mid + 1; - else if (cmp < 0) + else if (grp < GROUP_AT(group_info, mid)) right = mid; else return 1; @@ -164,12 +163,6 @@ int groups_search(const struct group_info *group_info, gid_t grp) */ int set_groups(struct cred *new, struct group_info *group_info) { - int retval; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0086628b6e9..72206cf5c6c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -89,8 +89,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); - xts = current_kernel_time(); - tom = wall_to_monotonic; + xts = __current_kernel_time(); + tom = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } @@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset; + struct timespec realtime_offset, wtm; unsigned long seq; if (!hrtimer_hres_active()) @@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) do { seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); + wtm = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); @@ -936,6 +931,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_is_queued(timer)) { + unsigned long state; int reprogram; /* @@ -949,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, - reprogram); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. 
+ */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; @@ -1096,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; - base = lock_hrtimer_base(timer, &flags); + lock_hrtimer_base(timer, &flags); rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); @@ -1237,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); enqueue_hrtimer(timer, base); } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + timer->state &= ~HRTIMER_STATE_CALLBACK; } @@ -1749,35 +1752,15 @@ void __init hrtimers_init(void) } /** - * schedule_hrtimeout_range - sleep until timeout + * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired otherwise -EINTR + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode) +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1799,7 +1782,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_on_stack(&t.timer, clock, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1818,6 +1801,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return !t.task ? 0 : -EINTR; } + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. 
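The schedule_hrtimeout_range() kerneldoc being reinstated here describes the calling convention; as the rest of the comment spells out just below, the caller sets its task state first and gets back 0 on expiry or -EINTR if a signal arrives. A minimal sketch of that convention, with an invented helper that sleeps for roughly 100us with 10us of slack:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

static int short_interruptible_nap(void)
{
        ktime_t expires = ktime_add_ns(ktime_get(), 100 * 1000);

        set_current_state(TASK_INTERRUPTIBLE);
        /* returns 0 if the timer expired, -EINTR if a signal arrived first */
        return schedule_hrtimeout_range(&expires, 10 * 1000,
                                        HRTIMER_MODE_ABS);
}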
+ * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 50dbd599958..c7c2aed9e2d 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,23 +40,33 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/slab.h> +#include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/hw_breakpoint.h> + /* * Constraints data */ /* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + +static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -67,16 +77,29 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ -static unsigned int max_task_bp_pinned(int cpu) +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - for (i = HBP_NUM -1; i >= 0; i--) { + for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -84,32 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -static int task_bp_pinned(struct task_struct *tsk) +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. 
+ */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = tsk->perf_event_ctxp; - struct list_head *list; - struct perf_event *bp; - unsigned long flags; + struct perf_event_context *ctx = bp->ctx; + struct perf_event *iter; int count = 0; - if (WARN_ONCE(!ctx, "No perf context for this task")) - return 0; - - list = &ctx->event_list; - - raw_spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->ctx == ctx && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); } - raw_spin_unlock_irqrestore(&ctx->lock, flags); - return count; } @@ -118,18 +130,19 @@ static int task_bp_pinned(struct task_struct *tsk) * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - slots->pinned += max_task_bp_pinned(cpu); + slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk); - slots->flexible = per_cpu(nr_bp_flexible, cpu); + slots->pinned += task_bp_pinned(bp, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; } @@ -137,16 +150,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) for_each_online_cpu(cpu) { unsigned int nr; - nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - nr += max_task_bp_pinned(cpu); + nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk); + nr += task_bp_pinned(bp, type); if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible, cpu); + nr = per_cpu(nr_bp_flexible[type], cpu); if (nr > slots->flexible) slots->flexible = nr; @@ -154,52 +167,89 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) } /* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. 
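The per-cpu bookkeeping around here, max_task_bp_pinned() above and the toggle_bp_task_slot() update just below, is easier to follow with concrete numbers: slot i counts how many tasks currently hold i+1 pinned breakpoints of a type, and the highest non-empty slot gives the per-cpu worst case. The stand-alone toy below, with invented tasks and weight 1 everywhere, mirrors that update rule; it is a model, not the kernel code.

#include <stdio.h>

#define NR_SLOTS 4
static unsigned int tsk_pinned[NR_SLOTS];       /* slot i: tasks with i+1 bps */

static void toggle_task_slot(int old_count, int weight, int enable)
{
        int idx = old_count - 1 + weight;       /* slot for old_count+weight bps */

        if (enable) {
                tsk_pinned[idx]++;
                if (old_count > 0)
                        tsk_pinned[old_count - 1]--;
        } else {
                tsk_pinned[idx]--;
                if (old_count > 0)
                        tsk_pinned[old_count - 1]++;
        }
}

static int max_task_pinned(void)
{
        int i;

        for (i = NR_SLOTS - 1; i >= 0; i--)
                if (tsk_pinned[i] > 0)
                        return i + 1;
        return 0;
}

int main(void)
{
        toggle_task_slot(0, 1, 1);      /* task A: 0 -> 1 breakpoint  */
        toggle_task_slot(1, 1, 1);      /* task A: 1 -> 2 breakpoints */
        toggle_task_slot(0, 1, 1);      /* task B: 0 -> 1 breakpoint  */
        printf("max pinned per task on this cpu: %d\n", max_task_pinned());
        return 0;
}

Running it prints 2: task A holds two pinned breakpoints, task B one, so the worst case on this cpu is 2.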
+ */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, + enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; - int count = 0; + int old_count = 0; + int old_idx = 0; + int idx = 0; - count = task_bp_pinned(tsk); + old_count = task_bp_pinned(bp, type); + old_idx = old_count - 1; + idx = old_idx + weight; - tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + /* tsk_pinned[n] is the number of tasks having n breakpoints */ + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { - tsk_pinned[count]++; - if (count > 0) - tsk_pinned[count-1]--; + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; } else { - tsk_pinned[count]--; - if (count > 0) - tsk_pinned[count-1]++; + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; } } /* * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_slot(struct perf_event *bp, bool enable) +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + /* Pinned counter task profiling */ - if (tsk) { - if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable); - return; - } + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable); - return; + toggle_bp_task_slot(bp, cpu, enable, type, weight); } - /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned, bp->cpu)++; - else - per_cpu(nr_cpu_bp_pinned, bp->cpu)--; + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ } /* @@ -243,38 +293,117 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; + enum bp_type_idx type; + int weight; - mutex_lock(&nr_bp_mutex); + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; - fetch_bp_busy_slots(&slots, bp); + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. 
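Plugging numbers into the admission check that follows makes the -ENOSPC condition clearer. The toy function below (all values invented) performs the same arithmetic: after simulating the new breakpoint's weight, the pinned total plus one slot held back for flexible events (when any exist) must still fit in the per-type slot count.

#include <stdio.h>

static int can_reserve(int pinned, int flexible, int weight, int nr_slots)
{
        pinned += weight;                       /* simulate adding this breakpoint */
        return pinned + !!flexible <= nr_slots; /* otherwise -ENOSPC */
}

int main(void)
{
        /* 4 data slots, 3 already pinned, at least one flexible event around */
        printf("with flexible events: %s\n",
               can_reserve(3, 1, 1, 4) ? "ok" : "ENOSPC");
        /* same request, but no flexible events to keep a slot for */
        printf("without             : %s\n",
               can_reserve(3, 0, 1, 4) ? "ok" : "ENOSPC");
        return 0;
}

The first request is refused (4 pinned plus the reserved flexible slot exceeds 4), the second fits exactly.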
+ */ + fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + return -ENOSPC; - toggle_bp_slot(bp, true); + toggle_bp_slot(bp, true, type, weight); + + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); -end: mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + arch_unregister_hw_breakpoint(bp); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} + +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { @@ -284,17 +413,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - /* - * Ptrace breakpoints can be temporary perf events only - * meant to reserve a slot. In this case, it is created disabled and - * we don't want to check the params right now (as we put a null addr) - * But perf tools create events as disabled and we want to check - * the params for them. 
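register_perf_hw_breakpoint() here feeds the new validate_hw_breakpoint(); its user-facing wrapper register_user_hw_breakpoint(), touched just below, is typically driven as in the sketch that follows. The handler body, the watched address and every my_-prefixed name are invented, and error handling is omitted.

#include <linux/kernel.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/sched.h>

static void my_bp_handler(struct perf_event *bp, int nmi,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        pr_info("watchpoint hit at %llx\n",
                (unsigned long long)bp->attr.bp_addr);
}

static struct perf_event *my_watch_word(struct task_struct *tsk,
                                        unsigned long addr)
{
        struct perf_event_attr attr;

        hw_breakpoint_init(&attr);              /* fills in sane defaults */
        attr.bp_addr = addr;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;
        attr.bp_type = HW_BREAKPOINT_W;         /* fire on writes */

        return register_user_hw_breakpoint(&attr, my_bp_handler, tsk);
}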
- * This is a quick hack that will be removed soon, once we remove - * the tmp breakpoints from ptrace - */ - if (!bp->attr.disabled || !bp->overflow_handler) - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = validate_hw_breakpoint(bp); + + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); return ret; } @@ -310,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), + triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -324,8 +448,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); @@ -337,7 +461,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (attr->disabled) goto end; - err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + err = validate_hw_breakpoint(bp); if (!err) perf_event_enable(bp); @@ -377,17 +501,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -415,7 +539,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -423,7 +547,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; @@ -444,7 +568,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { static int __init init_hw_breakpoint(void) { + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; } core_initcall(init_hw_breakpoint); @@ -453,5 +606,4 @@ struct pmu perf_ops_bp = { .enable = arch_install_hw_breakpoint, .disable = arch_uninstall_hw_breakpoint, .read = hw_breakpoint_pmu_read, - .unthrottle = hw_breakpoint_pmu_unthrottle }; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c 
index ecc3fa28f66..b7091d5ca2f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,11 +18,7 @@ #include "internals.h" -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) +static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc; unsigned long flags; @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) desc->depth = 1; desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->action = NULL; desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) } /** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * dynamic_irq_init - initialize a dynamically allocated irq * @irq: irq number to initialize */ -void dynamic_irq_cleanup(unsigned int irq) +void dynamic_irq_init(unsigned int irq) +{ + dynamic_irq_init_x(irq, false); +} + +/** + * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_init_keep_chip_data(unsigned int irq) +{ + dynamic_irq_init_x(irq, true); +} + +static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) } desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; desc->name = NULL; @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, false); +} + +/** + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, true); +} + /** * set_irq_chip - set the irq chip for an irq @@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->chip->ack) desc->chip->ack(irq); } + desc->status |= IRQ_MASKED; +} + +static inline void mask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask) { + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; + } +} + +static inline void unmask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->unmask) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } } /* @@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (unlikely(desc->status & IRQ_ONESHOT)) - desc->status |= IRQ_MASKED; - else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + unmask_irq(desc, irq); out_unlock: raw_spin_unlock(&desc->lock); } @@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - if (desc->chip->mask) - desc->chip->mask(irq); + mask_irq(desc, irq); goto out; } @@ -520,7 +568,7 @@ out: * signal. 
The occurence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one - * is handled by the assosiacted event handler. If this happens it + * is handled by the associated event handler. If this happens it * might be necessary to disable (mask) the interrupt depending on the * controller hardware. This requires to reenable the interrupt inside * of the loop which handles the interrupts which have arrived while @@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - desc->chip->mask(irq); + mask_irq(desc, irq); goto out_unlock; } @@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; + unmask_irq(desc, irq); } desc->status &= ~IRQ_PENDING; @@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void __init set_irq_noprobe(unsigned int irq) +void set_irq_noprobe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } -void __init set_irq_probe(unsigned int irq) +void set_irq_probe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d06df9c41cb..1ef4ffcdfa5 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * automatically freed on driver detach. * * If an IRQ allocated with this function needs to be freed - * separately, dev_free_irq() must be used. + * separately, devm_free_irq() must be used. */ int devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, @@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as free_irq(). * This function instead of free_irq() should be used to manually - * free IRQs allocated with dev_request_irq(). + * free IRQs allocated with devm_request_irq(). 
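/*
 * Illustrative sketch (not part of this patch) of the managed request/free
 * pairing the corrected comments above refer to: an IRQ obtained with
 * devm_request_irq() is released automatically on driver detach, and
 * devm_free_irq() is only needed for an early, manual release. The device,
 * handler and names below are placeholders.
 */
#include <linux/interrupt.h>
#include <linux/platform_device.h>

static irqreturn_t example_isr(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int example_probe(struct platform_device *pdev)
{
        int irq = platform_get_irq(pdev, 0);

        if (irq < 0)
                return irq;

        return devm_request_irq(&pdev->dev, irq, example_isr,
                                0, "example-device", pdev);
}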
*/ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f48..27e5c691122 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -19,7 +19,7 @@ #include <linux/kernel_stat.h> #include <linux/rculist.h> #include <linux/hash.h> -#include <linux/bootmem.h> +#include <linux/radix-tree.h> #include <trace/events/irq.h> #include "internals.h" @@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { void *ptr; - if (slab_is_available()) - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - else - ptr = alloc_bootmem_node(NODE_DATA(node), - nr * sizeof(*desc->kstat_irqs)); + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); /* * don't overwite if can not get new one @@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) */ DEFINE_RAW_SPINLOCK(sparse_irq_lock); -struct irq_desc **irq_desc_ptrs __read_mostly; +static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); + +static void set_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +void replace_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + void **ptr; + + ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); + if (ptr) + radix_tree_replace_slot(ptr, desc); +} static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -164,9 +179,6 @@ int __init early_irq_init(void) legacy_count = ARRAY_SIZE(irq_desc_legacy); node = first_online_node; - /* allocate irq_desc_ptrs array based on nr_irqs */ - irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); - /* allocate based on nr_cpu_ids */ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * sizeof(int), GFP_NOWAIT, node); @@ -180,23 +192,12 @@ int __init early_irq_init(void) lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); alloc_desc_masks(&desc[i], node, true); init_desc_masks(&desc[i]); - irq_desc_ptrs[i] = desc + i; + set_irq_desc(i, &desc[i]); } - for (i = legacy_count; i < nr_irqs; i++) - irq_desc_ptrs[i] = NULL; - return arch_early_irq_init(); } -struct irq_desc *irq_to_desc(unsigned int irq) -{ - if (irq_desc_ptrs && irq < nr_irqs) - return irq_desc_ptrs[irq]; - - return NULL; -} - struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; @@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc) return desc; raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc) goto out_unlock; - if (slab_is_available()) - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - else - desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { @@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) } init_one_irq_desc(irq, desc, node); - irq_desc_ptrs[irq] = desc; + set_irq_desc(irq, desc); out_unlock: raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); @@ -372,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct 
irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); - do { trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3..c63f3bc88f0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); extern raw_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ -/* irq_desc_ptrs allocated at boot time */ -extern struct irq_desc **irq_desc_ptrs; -#else -/* irq_desc_ptrs is a fixed size array */ -extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +void replace_irq_desc(unsigned int irq, struct irq_desc *desc); #endif #ifdef CONFIG_PROC_FS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078ca60c..c3003e9d91a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) return 0; } +int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->affinity_hint = m; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. @@ -200,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->status |= IRQ_SUSPENDED; } @@ -382,6 +398,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; + unsigned long flags; if (!desc) return 0; @@ -389,11 +406,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (desc->status & IRQ_NOREQUEST) return 0; + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; + raw_spin_unlock_irqrestore(&desc->lock, flags); + return !action; } @@ -436,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; + + if (chip != desc->chip) + irq_chip_set_defaults(desc->chip); } return ret; @@ -483,8 +506,26 @@ static int irq_wait_for_interrupt(struct irqaction *action) */ static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { +again: chip_bus_lock(irq, desc); raw_spin_lock_irq(&desc->lock); + + /* + * Implausible though it may be we need to protect us against + * the following scenario: + * + * The thread is faster done than the hard interrupt handler + * on the other CPU. If we unmask the irq line then the + * interrupt can come in again and masks the line, leaves due + * to IRQ_INPROGRESS and the irq line is masked forever. 
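/*
 * Illustrative sketch (not part of this patch): a driver would typically use
 * the irq_set_affinity_hint() helper added above to publish a preferred CPU
 * for each of its vectors, which tools such as irqbalance can read back from
 * /proc/irq/<irq>/affinity_hint (that proc file is created later in this
 * patch). The irq number, target CPU and mask storage are placeholders.
 * Only the pointer is stored, so the mask must stay valid until the hint is
 * cleared again with irq_set_affinity_hint(irq, NULL).
 */
#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int example_publish_hint(unsigned int irq, int cpu, struct cpumask *mask)
{
        cpumask_clear(mask);
        cpumask_set_cpu(cpu, mask);

        return irq_set_affinity_hint(irq, mask);
}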
+ */ + if (unlikely(desc->status & IRQ_INPROGRESS)) { + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); + cpu_relax(); + goto again; + } + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); @@ -884,6 +925,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } +#ifdef CONFIG_SMP + /* make sure affinity_hint is cleaned up */ + if (WARN_ON_ONCE(desc->affinity_hint)) + desc->affinity_hint = NULL; +#endif + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -995,7 +1042,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * @@ -1009,25 +1055,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, int retval; /* - * handle_IRQ_event() always ignores IRQF_DISABLED except for - * the _first_ irqaction (sigh). That can cause oopsing, but - * the behavior is classified as "will not fix" so we need to - * start nudging drivers away from using that idiom. - */ - if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == - (IRQF_SHARED|IRQF_DISABLED)) { - pr_warning( - "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", - irq, devname); - } - -#ifdef CONFIG_LOCKDEP - /* - * Lockdep wants atomic interrupt handlers: - */ - irqflags |= IRQF_DISABLED; -#endif - /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing @@ -1088,3 +1115,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_threaded_irq); + +/** + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It selects either a + * hardirq or threaded handling method depending on the + * context. + * + * On failure, it returns a negative value. On success, + * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + */ +int request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + int ret; + + if (!desc) + return -EINVAL; + + if (desc->status & IRQ_NESTED_THREAD) { + ret = request_threaded_irq(irq, NULL, handler, + flags, name, dev_id); + return !ret ? IRQC_IS_NESTED : ret; + } + + ret = request_irq(irq, handler, flags, name, dev_id); + return !ret ? 
IRQC_IS_HARDIRQ : ret; +} +EXPORT_SYMBOL_GPL(request_any_context_irq); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f86..65d3845665a 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -6,6 +6,7 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> @@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc && old_desc != desc) goto out_unlock; @@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, goto out_unlock; } - irq_desc_ptrs[irq] = desc; + replace_irq_desc(irq, desc); raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50eccc79c..09a2ee540bd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -7,6 +7,7 @@ */ #include <linux/irq.h> +#include <linux/gfp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -31,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) return 0; } +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long)m->private); + unsigned long flags; + cpumask_var_t mask; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + seq_cpumask(m, mask); + seq_putc(m, '\n'); + free_cpumask_var(mask); + + return 0; +} + #ifndef is_affinity_mask_valid #define is_affinity_mask_valid(val) 1 #endif @@ -83,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); +} + static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -91,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; +static const struct file_operations irq_affinity_hint_proc_fops = { + .open = irq_affinity_hint_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -146,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = { .release = single_release, .write = default_affinity_write, }; + +static int irq_node_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "%d\n", desc->node); + return 0; +} + +static int irq_node_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_node_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_node_proc_fops = { + .open = irq_node_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -230,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create 
/proc/irq/<irq>/smp_affinity */ proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); + + /* create /proc/irq/<irq>/affinity_hint */ + proc_create_data("affinity_hint", 0400, desc->dir, + &irq_affinity_hint_proc_fops, (void *)(long)irq); + + proc_create_data("node", 0444, desc->dir, + &irq_node_proc_fops, (void *)(long)irq); #endif proc_create_data("spurious", 0444, desc->dir, diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8e5288a8a35..6f6d091b575 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -16,11 +16,13 @@ #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <asm/sections.h> @@ -515,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } +#ifdef CONFIG_KGDB_KDB +const char *kdb_walk_kallsyms(loff_t *pos) +{ + static struct kallsym_iter kdb_walk_kallsyms_iter; + if (*pos == 0) { + memset(&kdb_walk_kallsyms_iter, 0, + sizeof(kdb_walk_kallsyms_iter)); + reset_iter(&kdb_walk_kallsyms_iter, 0); + } + while (1) { + if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) + return NULL; + ++*pos; + /* Some debugging symbols have no name. Ignore them. */ + if (kdb_walk_kallsyms_iter.name[0]) + return kdb_walk_kallsyms_iter.name; + } +} +#endif /* CONFIG_KGDB_KDB */ + static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, diff --git a/kernel/kexec.c b/kernel/kexec.c index ef077fb7315..c0613f7d673 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -41,7 +41,7 @@ #include <asm/sections.h> /* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t* crash_notes; +note_buf_t __percpu *crash_notes; /* vmcoreinfo stuff */ static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; @@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); result = copy_from_user(image->segment, segments, segment_bytes); - if (result) + if (result) { + result = -EFAULT; goto out; + } /* * Verify we have good destination addresses. The caller is @@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { - result = (result < 0) ? result : -EIO; + result = -EFAULT; goto out; } ubytes -= uchunk; @@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image, kexec_flush_icache_page(page); kunmap(page); if (result) { - result = (result < 0) ? 
result : -EIO; + result = -EFAULT; goto out; } ubytes -= uchunk; @@ -1089,9 +1091,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,11 +1137,9 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) { - crashk_res.end = end; + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); - } else - crashk_res.end = end - 1; + crashk_res.end = end - 1; unlock: mutex_unlock(&kexec_mutex); diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 32c5c15d750..01a0700e873 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,8 +1,7 @@ /* - * A generic kernel FIFO implementation. + * A generic kernel FIFO implementation * - * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> - * Copyright (C) 2004 Stelian Pop <stelian@popies.net> + * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -11,7 +10,7 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License @@ -24,420 +23,586 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/kfifo.h> #include <linux/log2.h> #include <linux/uaccess.h> +#include <linux/kfifo.h> -static void _kfifo_init(struct kfifo *fifo, void *buffer, - unsigned int size) -{ - fifo->buffer = buffer; - fifo->size = size; - - kfifo_reset(fifo); -} - -/** - * kfifo_init - initialize a FIFO using a preallocated buffer - * @fifo: the fifo to assign the buffer - * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this has to be a power of 2. - * +/* + * internal helper to calculate the unused elements in a fifo */ -void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) +static inline unsigned int kfifo_unused(struct __kfifo *fifo) { - /* size must be a power of 2 */ - BUG_ON(!is_power_of_2(size)); - - _kfifo_init(fifo, buffer, size); + return (fifo->mask + 1) - (fifo->in - fifo->out); } -EXPORT_SYMBOL(kfifo_init); -/** - * kfifo_alloc - allocates a new FIFO internal buffer - * @fifo: the fifo to assign then new buffer - * @size: the size of the buffer to be allocated, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() - * - * This function dynamically allocates a new fifo internal buffer - * - * The size will be rounded-up to a power of 2. - * The buffer will be release with kfifo_free(). - * Return 0 if no error, otherwise the an error code - */ -int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) +int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) { - unsigned char *buffer; - /* - * round up to the next power of 2, since our 'let the indices + * round down to the next power of 2, since our 'let the indices * wrap' technique works only in this case. 
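/*
 * Illustrative sketch (not part of this patch) of the sizing rule described
 * in the comment above and applied just below: a size that is not a power of
 * two is now rounded *down* (the old code rounded up), and the resulting
 * index mask is size - 1. The numbers in the comments are only examples.
 */
#include <linux/log2.h>

static unsigned int example_fifo_mask(unsigned int requested)
{
        unsigned int size = requested;

        if (!is_power_of_2(size))
                size = rounddown_pow_of_two(size);      /* e.g. 1000 -> 512 */

        return size - 1;                                /* e.g. mask 0x1ff */
}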
*/ - if (!is_power_of_2(size)) { - BUG_ON(size > 0x80000000); - size = roundup_pow_of_two(size); + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + + if (size < 2) { + fifo->data = NULL; + fifo->mask = 0; + return -EINVAL; } - buffer = kmalloc(size, gfp_mask); - if (!buffer) { - _kfifo_init(fifo, 0, 0); + fifo->data = kmalloc(size * esize, gfp_mask); + + if (!fifo->data) { + fifo->mask = 0; return -ENOMEM; } - - _kfifo_init(fifo, buffer, size); + fifo->mask = size - 1; return 0; } -EXPORT_SYMBOL(kfifo_alloc); +EXPORT_SYMBOL(__kfifo_alloc); -/** - * kfifo_free - frees the FIFO internal buffer - * @fifo: the fifo to be freed. - */ -void kfifo_free(struct kfifo *fifo) +void __kfifo_free(struct __kfifo *fifo) { - kfree(fifo->buffer); + kfree(fifo->data); + fifo->in = 0; + fifo->out = 0; + fifo->esize = 0; + fifo->data = NULL; + fifo->mask = 0; } -EXPORT_SYMBOL(kfifo_free); +EXPORT_SYMBOL(__kfifo_free); -/** - * kfifo_skip - skip output data - * @fifo: the fifo to be used. - * @len: number of bytes to skip - */ -void kfifo_skip(struct kfifo *fifo, unsigned int len) +int __kfifo_init(struct __kfifo *fifo, void *buffer, + unsigned int size, size_t esize) { - if (len < kfifo_len(fifo)) { - __kfifo_add_out(fifo, len); - return; + size /= esize; + + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + fifo->data = buffer; + + if (size < 2) { + fifo->mask = 0; + return -EINVAL; } - kfifo_reset_out(fifo); + fifo->mask = size - 1; + + return 0; } -EXPORT_SYMBOL(kfifo_skip); +EXPORT_SYMBOL(__kfifo_init); -static inline void __kfifo_in_data(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int off) +static void kfifo_copy_in(struct __kfifo *fifo, const void *src, + unsigned int len, unsigned int off) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(fifo->data + off, src, l); + memcpy(fifo->data, src + l, len - l); /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. 
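/*
 * Illustrative sketch (not part of this patch) of the index scheme used by
 * kfifo_unused() and kfifo_copy_in() above: fifo->in and fifo->out are
 * free-running counters that are only masked when used as buffer offsets,
 * so plain unsigned subtraction keeps giving the fill level even after the
 * counters wrap around. The values in the comments are only examples.
 */
static unsigned int example_fifo_len(unsigned int in, unsigned int out)
{
        /* e.g. in = 0x00000002, out = 0xfffffffe  ->  4 elements queued */
        return in - out;
}

static unsigned int example_fifo_off(unsigned int idx, unsigned int mask)
{
        /* e.g. idx = 0x00000102, mask = 0x00ff  ->  buffer offset 2 */
        return idx & mask;
}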
+ * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter */ + smp_wmb(); +} - smp_mb(); - - off = __kfifo_off(fifo, fifo->in + off); +unsigned int __kfifo_in(struct __kfifo *fifo, + const void *buf, unsigned int len) +{ + unsigned int l; - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - off); - memcpy(fifo->buffer + off, from, l); + l = kfifo_unused(fifo); + if (len > l) + len = l; - /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, from + l, len - l); + kfifo_copy_in(fifo, buf, len, fifo->in); + fifo->in += len; + return len; } +EXPORT_SYMBOL(__kfifo_in); -static inline void __kfifo_out_data(struct kfifo *fifo, - void *to, unsigned int len, unsigned int off) +static void kfifo_copy_out(struct __kfifo *fifo, void *dst, + unsigned int len, unsigned int off) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(dst, fifo->data + off, l); + memcpy(dst + l, fifo->data, len - l); /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. + * make sure that the data is copied before + * incrementing the fifo->out index counter */ + smp_wmb(); +} - smp_rmb(); +unsigned int __kfifo_out_peek(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + unsigned int l; - off = __kfifo_off(fifo, fifo->out + off); + l = fifo->in - fifo->out; + if (len > l) + len = l; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - memcpy(to, fifo->buffer + off, l); + kfifo_copy_out(fifo, buf, len, fifo->out); + return len; +} +EXPORT_SYMBOL(__kfifo_out_peek); - /* then get the rest (if any) from the beginning of the buffer */ - memcpy(to + l, fifo->buffer, len - l); +unsigned int __kfifo_out(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + len = __kfifo_out_peek(fifo, buf, len); + fifo->out += len; + return len; } +EXPORT_SYMBOL(__kfifo_out); -static inline int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned *lenout) +static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned int *copied) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; - int ret; + unsigned long ret; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_from_user(fifo->data + off, from, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_from_user(fifo->data, from + l, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. 
+ * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} - smp_mb(); +int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; - off = __kfifo_off(fifo, fifo->in + off); + if (esize != 1) + len /= esize; - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - off); - ret = copy_from_user(fifo->buffer + off, from, l); - if (unlikely(ret)) { - *lenout = ret; - return -EFAULT; - } - *lenout = l; + l = kfifo_unused(fifo); + if (len > l) + len = l; - /* then put the rest (if any) at the beginning of the buffer */ - ret = copy_from_user(fifo->buffer, from + l, len - l); - *lenout += ret ? ret : len - l; - return ret ? -EFAULT : 0; + ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->in += len; + return err; } +EXPORT_SYMBOL(__kfifo_from_user); -static inline int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off, unsigned *lenout) +static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, + unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; - int ret; - + unsigned long ret; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_to_user(to, fifo->data + off, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_to_user(to + l, fifo->data, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. 
+ * make sure that the data is copied before + * incrementing the fifo->out index counter */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} - smp_rmb(); +int __kfifo_to_user(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; - off = __kfifo_off(fifo, fifo->out + off); + if (esize != 1) + len /= esize; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - ret = copy_to_user(to, fifo->buffer + off, l); - *lenout = l; + l = fifo->in - fifo->out; + if (len > l) + len = l; + ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { - *lenout -= ret; - return -EFAULT; - } + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->out += len; + return err; +} +EXPORT_SYMBOL(__kfifo_to_user); - /* then get the rest (if any) from the beginning of the buffer */ - len -= l; - ret = copy_to_user(to + l, fifo->buffer, len); - if (unlikely(ret)) { - *lenout += len - ret; - return -EFAULT; +static int setup_sgl_buf(struct scatterlist *sgl, void *buf, + int nents, unsigned int len) +{ + int n; + unsigned int l; + unsigned int off; + struct page *page; + + if (!nents) + return 0; + + if (!len) + return 0; + + n = 0; + page = virt_to_page(buf); + off = offset_in_page(buf); + l = 0; + + while (len >= l + PAGE_SIZE - off) { + struct page *npage; + + l += PAGE_SIZE; + buf += PAGE_SIZE; + npage = virt_to_page(buf); + if (page_to_phys(page) != page_to_phys(npage) - l) { + sg_set_page(sgl, page, l - off, off); + sgl = sg_next(sgl); + if (++n == nents || sgl == NULL) + return n; + page = npage; + len -= l - off; + l = off = 0; + } } - *lenout += len; - return 0; + sg_set_page(sgl, page, len, off); + return n + 1; } -unsigned int __kfifo_in_n(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int recsize) +static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, + int nents, unsigned int len, unsigned int off) { - if (kfifo_avail(fifo) < len + recsize) - return len + 1; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned int n; - __kfifo_in_data(fifo, from, len, recsize); - return 0; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + n = setup_sgl_buf(sgl, fifo->data + off, nents, l); + n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); + + return n; } -EXPORT_SYMBOL(__kfifo_in_n); -/** - * kfifo_in - puts some data into the FIFO - * @fifo: the fifo to be used. - * @from: the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. 
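/*
 * Illustrative sketch (not part of this patch) driving the low-level helpers
 * exported above directly; real users normally go through the kfifo.h
 * wrapper macros instead. An element size of 1 makes this a plain byte FIFO.
 */
#include <linux/kfifo.h>
#include <linux/slab.h>
#include <linux/errno.h>

static int example_byte_roundtrip(void)
{
        struct __kfifo fifo;
        unsigned char out[4];
        unsigned int n;
        int ret;

        ret = __kfifo_alloc(&fifo, 16, sizeof(unsigned char), GFP_KERNEL);
        if (ret)
                return ret;

        n = __kfifo_in(&fifo, "abcd", 4);          /* returns elements stored */
        n = __kfifo_out(&fifo, out, sizeof(out));  /* returns elements copied */

        __kfifo_free(&fifo);
        return n == 4 ? 0 : -EIO;
}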
- */ -unsigned int kfifo_in(struct kfifo *fifo, const void *from, - unsigned int len) +unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) { - len = min(kfifo_avail(fifo), len); + unsigned int l; - __kfifo_in_data(fifo, from, len, 0); - __kfifo_add_in(fifo, len); - return len; + l = kfifo_unused(fifo); + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->in); } -EXPORT_SYMBOL(kfifo_in); +EXPORT_SYMBOL(__kfifo_dma_in_prepare); -unsigned int __kfifo_in_generic(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) { - return __kfifo_in_rec(fifo, from, len, recsize); + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->out); } -EXPORT_SYMBOL(__kfifo_in_generic); +EXPORT_SYMBOL(__kfifo_dma_out_prepare); -unsigned int __kfifo_out_n(struct kfifo *fifo, - void *to, unsigned int len, unsigned int recsize) +unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { - if (kfifo_len(fifo) < len + recsize) - return len; + unsigned int max = (1 << (recsize << 3)) - 1; - __kfifo_out_data(fifo, to, len, recsize); - __kfifo_add_out(fifo, len + recsize); - return 0; + if (len > max) + return max; + return len; } -EXPORT_SYMBOL(__kfifo_out_n); -/** - * kfifo_out - gets some data from the FIFO - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. +#define __KFIFO_PEEK(data, out, mask) \ + ((data)[(out) & (mask)]) +/* + * __kfifo_peek_n internal helper function for determinate the length of + * the next record in the fifo */ -unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) +static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { - len = min(kfifo_len(fifo), len); + unsigned int l; + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; - __kfifo_out_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); + l = __KFIFO_PEEK(data, fifo->out, mask); - return len; + if (--recsize) + l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; + + return l; } -EXPORT_SYMBOL(kfifo_out); -/** - * kfifo_out_peek - copy some data from the FIFO, but do not remove it - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * @offset: offset into the fifo - * - * This function copies at most @len bytes at @offset from the FIFO - * into the @to buffer and returns the number of copied bytes. - * The data is not removed from the FIFO. 
+#define __KFIFO_POKE(data, in, mask, val) \ + ( \ + (data)[(in) & (mask)] = (unsigned char)(val) \ + ) + +/* + * __kfifo_poke_n internal helper function for storeing the length of + * the record into the fifo */ -unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, - unsigned offset) +static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { - len = min(kfifo_len(fifo), len + offset); + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; - __kfifo_out_data(fifo, to, len, offset); - return len; + __KFIFO_POKE(data, fifo->in, mask, n); + + if (recsize > 1) + __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } -EXPORT_SYMBOL(kfifo_out_peek); -unsigned int __kfifo_out_generic(struct kfifo *fifo, - void *to, unsigned int len, unsigned int recsize, - unsigned int *total) +unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { - return __kfifo_out_rec(fifo, to, len, recsize, total); + return __kfifo_peek_n(fifo, recsize); } -EXPORT_SYMBOL(__kfifo_out_generic); +EXPORT_SYMBOL(__kfifo_len_r); -unsigned int __kfifo_from_user_n(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, + unsigned int len, size_t recsize) { - unsigned total; + if (len + recsize > kfifo_unused(fifo)) + return 0; - if (kfifo_avail(fifo) < len + recsize) - return len + 1; + __kfifo_poke_n(fifo, len, recsize); - __kfifo_from_user_data(fifo, from, len, recsize, &total); - return total; + kfifo_copy_in(fifo, buf, len, fifo->in + recsize); + fifo->in += len + recsize; + return len; } -EXPORT_SYMBOL(__kfifo_from_user_n); +EXPORT_SYMBOL(__kfifo_in_r); -/** - * kfifo_from_user - puts some data from user space into the FIFO - * @fifo: the fifo to be used. - * @from: pointer to the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from into the - * FIFO depending and returns -EFAULT/0. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. 
- */ -int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned *total) -{ - int ret; - len = min(kfifo_avail(fifo), len); - ret = __kfifo_from_user_data(fifo, from, len, 0, total); - if (ret) - return ret; - __kfifo_add_in(fifo, len); - return 0; +static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, + void *buf, unsigned int len, size_t recsize, unsigned int *n) +{ + *n = __kfifo_peek_n(fifo, recsize); + + if (len > *n) + len = *n; + + kfifo_copy_out(fifo, buf, len, fifo->out + recsize); + return len; } -EXPORT_SYMBOL(kfifo_from_user); -unsigned int __kfifo_from_user_generic(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) { - return __kfifo_from_user_rec(fifo, from, len, recsize); + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } -EXPORT_SYMBOL(__kfifo_from_user_generic); +EXPORT_SYMBOL(__kfifo_out_peek_r); -unsigned int __kfifo_to_user_n(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int reclen, - unsigned int recsize) +unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) { - unsigned int ret, total; + unsigned int n; - if (kfifo_len(fifo) < reclen + recsize) - return len; + if (fifo->in == fifo->out) + return 0; - ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); + len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); + fifo->out += n + recsize; + return len; +} +EXPORT_SYMBOL(__kfifo_out_r); - if (likely(ret == 0)) - __kfifo_add_out(fifo, reclen + recsize); +void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int n; - return total; + n = __kfifo_peek_n(fifo, recsize); + fifo->out += n + recsize; } -EXPORT_SYMBOL(__kfifo_to_user_n); +EXPORT_SYMBOL(__kfifo_skip_r); -/** - * kfifo_to_user - gets data from the FIFO and write it to user space - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - @ @lenout: pointer to output variable with copied data - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and 0 or -EFAULT. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. 
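/*
 * Illustrative sketch (not part of this patch) of the record variants added
 * above: __kfifo_in_r() stores a recsize-byte length header in front of each
 * payload (written by __kfifo_poke_n()), and __kfifo_out_r() reads it back,
 * which is why __kfifo_max_r() caps a record at (1 << (recsize * 8)) - 1
 * elements. Assumes a fifo that was set up with an element size of 1.
 */
#include <linux/kfifo.h>
#include <linux/errno.h>

static int example_record_roundtrip(struct __kfifo *fifo)
{
        char buf[8];
        unsigned int copied;

        /* recsize == 1: one length byte, so records of up to 255 bytes */
        if (__kfifo_in_r(fifo, "foo", 3, 1) != 3)
                return -ENOSPC;

        copied = __kfifo_out_r(fifo, buf, sizeof(buf), 1);
        return copied == 3 ? 0 : -EIO;
}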
- */ -int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned *lenout) +int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied, size_t recsize) { - int ret; - len = min(kfifo_len(fifo), len); - ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); - __kfifo_add_out(fifo, *lenout); - return ret; + unsigned long ret; + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) { + *copied = 0; + return 0; + } + + __kfifo_poke_n(fifo, len, recsize); + + ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->in += len + recsize; + return 0; } -EXPORT_SYMBOL(kfifo_to_user); +EXPORT_SYMBOL(__kfifo_from_user_r); -unsigned int __kfifo_to_user_generic(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int recsize, - unsigned int *total) +int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied, size_t recsize) { - return __kfifo_to_user_rec(fifo, to, len, recsize, total); + unsigned long ret; + unsigned int n; + + if (fifo->in == fifo->out) { + *copied = 0; + return 0; + } + + n = __kfifo_peek_n(fifo, recsize); + if (len > n) + len = n; + + ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->out += n + recsize; + return 0; } -EXPORT_SYMBOL(__kfifo_to_user_generic); +EXPORT_SYMBOL(__kfifo_to_user_r); -unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - if (recsize == 0) - return kfifo_avail(fifo); + if (!nents) + BUG(); - return __kfifo_peek_n(fifo, recsize); + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); } -EXPORT_SYMBOL(__kfifo_peek_generic); +EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); -void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +void __kfifo_dma_in_finish_r(struct __kfifo *fifo, + unsigned int len, size_t recsize) { - __kfifo_skip_rec(fifo, recsize); + len = __kfifo_max_r(len, recsize); + __kfifo_poke_n(fifo, len, recsize); + fifo->in += len + recsize; } -EXPORT_SYMBOL(__kfifo_skip_generic); +EXPORT_SYMBOL(__kfifo_dma_in_finish_r); +unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) +{ + if (!nents) + BUG(); + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > fifo->in - fifo->out) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); +} +EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); + +void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int len; + + len = __kfifo_peek_n(fifo, recsize); + fifo->out += len + recsize; +} +EXPORT_SYMBOL(__kfifo_dma_out_finish_r); diff --git a/kernel/kgdb.c b/kernel/kgdb.c deleted file mode 100644 index 2eb517e2351..00000000000 --- a/kernel/kgdb.c +++ /dev/null @@ -1,1760 +0,0 @@ -/* - * KGDB stub. - * - * Maintainer: Jason Wessel <jason.wessel@windriver.com> - * - * Copyright (C) 2000-2001 VERITAS Software Corporation. - * Copyright (C) 2002-2004 Timesys Corporation - * Copyright (C) 2003-2004 Amit S. 
Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> - * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2008 Wind River Systems, Inc. - * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * - * Contributors at various stages not listed above: - * Jason Wessel ( jason.wessel@windriver.com ) - * George Anzinger <george@mvista.com> - * Anurekh Saxena (anurekh.saxena@timesys.com) - * Lake Stevens Instrument Division (Glenn Engel) - * Jim Kingdon, Cygnus Support. - * - * Original KGDB stub: David Grothe <dave@gcom.com>, - * Tigran Aivazian <tigran@sco.com> - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ -#include <linux/pid_namespace.h> -#include <linux/clocksource.h> -#include <linux/interrupt.h> -#include <linux/spinlock.h> -#include <linux/console.h> -#include <linux/threads.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ptrace.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/sysrq.h> -#include <linux/init.h> -#include <linux/kgdb.h> -#include <linux/pid.h> -#include <linux/smp.h> -#include <linux/mm.h> - -#include <asm/cacheflush.h> -#include <asm/byteorder.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/unaligned.h> - -static int kgdb_break_asap; - -#define KGDB_MAX_THREAD_QUERY 17 -struct kgdb_state { - int ex_vector; - int signo; - int err_code; - int cpu; - int pass_exception; - unsigned long thr_query; - unsigned long threadid; - long kgdb_usethreadid; - struct pt_regs *linux_regs; -}; - -static struct debuggerinfo_struct { - void *debuggerinfo; - struct task_struct *task; -} kgdb_info[NR_CPUS]; - -/** - * kgdb_connected - Is a host GDB connected to us? - */ -int kgdb_connected; -EXPORT_SYMBOL_GPL(kgdb_connected); - -/* All the KGDB handlers are installed */ -static int kgdb_io_module_registered; - -/* Guard for recursive entry */ -static int exception_level; - -static struct kgdb_io *kgdb_io_ops; -static DEFINE_SPINLOCK(kgdb_registration_lock); - -/* kgdb console driver is loaded */ -static int kgdb_con_registered; -/* determine if kgdb console output should be used */ -static int kgdb_use_con; - -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - -module_param(kgdb_use_con, int, 0644); - -/* - * Holds information about breakpoints in a kernel. These breakpoints are - * added and removed by gdb. - */ -static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { - [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } -}; - -/* - * The CPU# of the active CPU, or -1 if none: - */ -atomic_t kgdb_active = ATOMIC_INIT(-1); - -/* - * We use NR_CPUs not PERCPU, in case kgdb is used to debug early - * bootup code (which might not have percpu set up yet): - */ -static atomic_t passive_cpu_wait[NR_CPUS]; -static atomic_t cpu_in_kgdb[NR_CPUS]; -atomic_t kgdb_setting_breakpoint; - -struct task_struct *kgdb_usethread; -struct task_struct *kgdb_contthread; - -int kgdb_single_step; -pid_t kgdb_sstep_pid; - -/* Our I/O buffers. 
*/ -static char remcom_in_buffer[BUFMAX]; -static char remcom_out_buffer[BUFMAX]; - -/* Storage for the registers, in GDB format. */ -static unsigned long gdb_regs[(NUMREGBYTES + - sizeof(unsigned long) - 1) / - sizeof(unsigned long)]; - -/* to keep track of the CPU which is doing the single stepping*/ -atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); - -/* - * If you are debugging a problem where roundup (the collection of - * all other CPUs) is a problem [this should be extremely rare], - * then use the nokgdbroundup option to avoid roundup. In that case - * the other CPUs might interfere with your debugging context, so - * use this with care: - */ -static int kgdb_do_roundup = 1; - -static int __init opt_nokgdbroundup(char *str) -{ - kgdb_do_roundup = 0; - - return 0; -} - -early_param("nokgdbroundup", opt_nokgdbroundup); - -/* - * Finally, some KGDB code :-) - */ - -/* - * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: - */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) -{ - int err; - - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); - if (err) - return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); -} - -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) -{ - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); -} - -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - int err; - /* Validate setting the breakpoint and then removing it. In the - * remove fails, the kernel needs to emit a bad message because we - * are deep trouble not being able to put things back the way we - * found them. - */ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); - if (err) - return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); - if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); - return err; -} - -unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - -int __weak kgdb_arch_init(void) -{ - return 0; -} - -int __weak kgdb_skipexception(int exception, struct pt_regs *regs) -{ - return 0; -} - -void __weak -kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - return; -} - -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. - * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. 
- */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - -/* - * GDB remote protocol parser: - */ - -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - -/* scan for the sequence $<data>#<checksum> */ -static void get_packet(char *buffer) -{ - unsigned char checksum; - unsigned char xmitcsum; - int count; - char ch; - - do { - /* - * Spin and wait around for the start character, ignore all - * other characters: - */ - while ((ch = (kgdb_io_ops->read_char())) != '$') - /* nothing */; - - kgdb_connected = 1; - checksum = 0; - xmitcsum = -1; - - count = 0; - - /* - * now, read until a # or end of buffer is found: - */ - while (count < (BUFMAX - 1)) { - ch = kgdb_io_ops->read_char(); - if (ch == '#') - break; - checksum = checksum + ch; - buffer[count] = ch; - count = count + 1; - } - buffer[count] = 0; - - if (ch == '#') { - xmitcsum = hex(kgdb_io_ops->read_char()) << 4; - xmitcsum += hex(kgdb_io_ops->read_char()); - - if (checksum != xmitcsum) - /* failed checksum */ - kgdb_io_ops->write_char('-'); - else - /* successful transfer */ - kgdb_io_ops->write_char('+'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - } - } while (checksum != xmitcsum); -} - -/* - * Send the packet in buffer. - * Check for gdb connection if asked for. - */ -static void put_packet(char *buffer) -{ - unsigned char checksum; - int count; - char ch; - - /* - * $<packet info>#<checksum>. - */ - while (1) { - kgdb_io_ops->write_char('$'); - checksum = 0; - count = 0; - - while ((ch = buffer[count])) { - kgdb_io_ops->write_char(ch); - checksum += ch; - count++; - } - - kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hex_asc_hi(checksum)); - kgdb_io_ops->write_char(hex_asc_lo(checksum)); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - - /* Now see what we get in reply. */ - ch = kgdb_io_ops->read_char(); - - if (ch == 3) - ch = kgdb_io_ops->read_char(); - - /* If we get an ACK, we are done. */ - if (ch == '+') - return; - - /* - * If we get the start of another packet, this means - * that GDB is attempting to reconnect. We will NAK - * the packet being sent, and stop trying to send this - * packet. - */ - if (ch == '$') { - kgdb_io_ops->write_char('-'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - return; - } - } -} - -/* - * Convert the memory pointed to by mem into hex, placing result in buf. - * Return a pointer to the last char put in buf (null). May return an error. - */ -int kgdb_mem2hex(char *mem, char *buf, int count) -{ - char *tmp; - int err; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory copy. Hex conversion will work against this one. - */ - tmp = buf + count; - - err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; - } - - return err; -} - -/* - * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return a pointer to the character after - * the last byte written. - */ -static int kgdb_ebin2mem(char *buf, char *mem, int count) -{ - int err = 0; - char c; - - while (count-- > 0) { - c = *buf++; - if (c == 0x7d) - c = *buf++ ^ 0x20; - - err = probe_kernel_write(mem, &c, 1); - if (err) - break; - - mem++; - } - - return err; -} - -/* - * Convert the hex array pointed to by buf into binary to be placed in mem. 
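/*
 * Illustrative sketch (not part of this patch) of the framing handled by
 * get_packet()/put_packet() above: a GDB remote packet travels as
 * $<payload>#<checksum>, where the checksum is the low eight bits of the
 * sum of the payload characters, transmitted as two hex digits.
 */
static unsigned char example_gdb_checksum(const char *payload)
{
        unsigned char sum = 0;

        while (*payload)
                sum += *payload++;

        return sum;             /* e.g. "g" -> 0x67, framed as "$g#67" */
}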
- * Return a pointer to the character AFTER the last byte written. - * May return an error. - */ -int kgdb_hex2mem(char *buf, char *mem, int count) -{ - char *tmp_raw; - char *tmp_hex; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory that is converted from hex. - */ - tmp_raw = buf + count * 2; - - tmp_hex = tmp_raw - 1; - while (tmp_hex >= buf) { - tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; - } - - return probe_kernel_write(mem, tmp_raw, count); -} - -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. - */ -int kgdb_hex2long(char **ptr, unsigned long *long_val) -{ - int hex_val; - int num = 0; - int negate = 0; - - *long_val = 0; - - if (**ptr == '-') { - negate = 1; - (*ptr)++; - } - while (**ptr) { - hex_val = hex(**ptr); - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - num++; - (*ptr)++; - } - - if (negate) - *long_val = -*long_val; - - return num; -} - -/* Write memory due to an 'M' or 'X' packet. */ -static int write_mem_msg(int binary) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long addr; - unsigned long length; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && - kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { - if (binary) - err = kgdb_ebin2mem(ptr, (char *)addr, length); - else - err = kgdb_hex2mem(ptr, (char *)addr, length); - if (err) - return err; - if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length); - return 0; - } - - return -EINVAL; -} - -static void error_packet(char *pkt, int error) -{ - error = -error; - pkt[0] = 'E'; - pkt[1] = hex_asc[(error / 10)]; - pkt[2] = hex_asc[(error % 10)]; - pkt[3] = '\0'; -} - -/* - * Thread ID accessors. We represent a flat TID space to GDB, where - * the per CPU idle threads (which under Linux all have PID 0) are - * remapped to negative TIDs. 
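/*
 * Illustrative worked example (not part of this patch) of the thread-ID
 * remapping described above and implemented by getthread()/shadow_pid()
 * below: ordinary tasks keep their PID as the GDB TID, while the per-CPU
 * idle threads (all PID 0 under Linux) are presented as -cpu - 2.
 */
static int example_shadow_tid(int realpid, int cpu)
{
        if (realpid)
                return realpid;         /* normal task: TID == PID */

        return -cpu - 2;                /* idle on CPU 0 -> -2, CPU 1 -> -3 */
}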
- */ - -#define BUF_THREAD_ID_SIZE 16 - -static char *pack_threadid(char *pkt, unsigned char *id) -{ - char *limit; - - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); - - return pkt; -} - -static void int_to_threadref(unsigned char *id, int value) -{ - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); -} - -static struct task_struct *getthread(struct pt_regs *regs, int tid) -{ - /* - * Non-positive TIDs are remapped to the cpu shadow information - */ - if (tid == 0 || tid == -1) - tid = -atomic_read(&kgdb_active) - 2; - if (tid < -1 && tid > -NR_CPUS - 2) { - if (kgdb_info[-tid - 2].task) - return kgdb_info[-tid - 2].task; - else - return idle_task(-tid - 2); - } - if (tid <= 0) { - printk(KERN_ERR "KGDB: Internal thread select error\n"); - dump_stack(); - return NULL; - } - - /* - * find_task_by_pid_ns() does not take the tasklist lock anymore - * but is nicely RCU locked - hence is a pretty resilient - * thing to use: - */ - return find_task_by_pid_ns(tid, &init_pid_ns); -} - -/* - * CPU debug state control: - */ - -#ifdef CONFIG_SMP -static void kgdb_wait(struct pt_regs *regs) -{ - unsigned long flags; - int cpu; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - kgdb_info[cpu].debuggerinfo = regs; - kgdb_info[cpu].task = current; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - smp_wmb(); - atomic_set(&cpu_in_kgdb[cpu], 1); - - /* Wait till primary CPU is done with debugging */ - while (atomic_read(&passive_cpu_wait[cpu])) - cpu_relax(); - - kgdb_info[cpu].debuggerinfo = NULL; - kgdb_info[cpu].task = NULL; - - /* fix up hardware debug registers on local cpu */ - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - - /* Signal the primary CPU that we are done: */ - atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); - clocksource_touch_watchdog(); - local_irq_restore(flags); -} -#endif - -/* - * Some architectures need cache flushes when we set/clear a - * breakpoint: - */ -static void kgdb_flush_swbreak_addr(unsigned long addr) -{ - if (!CACHE_FLUSH_IS_SAFE) - return; - - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); - } - /* Force flush instruction cache if it was outside the mm */ - flush_icache_range(addr, addr + BREAK_INSTR_SIZE); -} - -/* - * SW breakpoint management: - */ -static int kgdb_activate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_SET) - continue; - - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", addr); - continue; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_ACTIVE; - } - return ret; -} - -static int kgdb_set_sw_break(unsigned long addr) -{ - int err = kgdb_validate_break_address(addr); - int breakno = -1; - int i; - - if (err) - return err; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) - return -EEXIST; - } - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_REMOVED && - kgdb_break[i].bpt_addr == addr) { - breakno = i; - break; - } - } - - if (breakno == -1) { - for (i = 0; i < 
KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_UNDEFINED) { - breakno = i; - break; - } - } - } - - if (breakno == -1) - return -E2BIG; - - kgdb_break[breakno].state = BP_SET; - kgdb_break[breakno].type = BP_BREAKPOINT; - kgdb_break[breakno].bpt_addr = addr; - - return 0; -} - -static int kgdb_deactivate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); - ret = error; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_SET; - } - return ret; -} - -static int kgdb_remove_sw_break(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) { - kgdb_break[i].state = BP_REMOVED; - return 0; - } - } - return -ENOENT; -} - -int kgdb_isremovedbreak(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_REMOVED) && - (kgdb_break[i].bpt_addr == addr)) - return 1; - } - return 0; -} - -static int remove_all_break(void) -{ - unsigned long addr; - int error; - int i; - - /* Clear memory breakpoints. */ - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); -setundefined: - kgdb_break[i].state = BP_UNDEFINED; - } - - /* Clear hardware breakpoints. */ - if (arch_kgdb_ops.remove_all_hw_break) - arch_kgdb_ops.remove_all_hw_break(); - - return 0; -} - -/* - * Remap normal tasks to their real PID, - * CPU shadow threads are mapped to -CPU - 2 - */ -static inline int shadow_pid(int realpid) -{ - if (realpid) - return realpid; - - return -raw_smp_processor_id() - 2; -} - -static char gdbmsgbuf[BUFMAX + 1]; - -static void kgdb_msg_write(const char *s, int len) -{ - char *bufptr; - int wcount; - int i; - - /* 'O'utput */ - gdbmsgbuf[0] = 'O'; - - /* Fill and send buffers... */ - while (len > 0) { - bufptr = gdbmsgbuf + 1; - - /* Calculate how many this time */ - if ((len << 1) > (BUFMAX - 2)) - wcount = (BUFMAX - 2) >> 1; - else - wcount = len; - - /* Pack in hex chars */ - for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); - *bufptr = '\0'; - - /* Move up */ - s += wcount; - len -= wcount; - - /* Write packet */ - put_packet(gdbmsgbuf); - } -} - -/* - * Return true if there is a valid kgdb I/O module. Also if no - * debugger is attached a message can be printed to the console about - * waiting for the debugger to attach. - * - * The print_wait argument is only to be true when called from inside - * the core kgdb_handle_exception, because it will wait for the - * debugger to attach. - */ -static int kgdb_io_ready(int print_wait) -{ - if (!kgdb_io_ops) - return 0; - if (kgdb_connected) - return 1; - if (atomic_read(&kgdb_setting_breakpoint)) - return 1; - if (print_wait) - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); - return 1; -} - -/* - * All the functions that start with gdb_cmd are the various - * operations to implement the handlers for the gdbserial protocol - * where KGDB is communicating with an external debugger - */ - -/* Handle the '?' 
status packets */ -static void gdb_cmd_status(struct kgdb_state *ks) -{ - /* - * We know that this packet is only sent - * during initial connect. So to be safe, - * we clear out our breakpoints now in case - * GDB is reconnecting. - */ - remove_all_break(); - - remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); -} - -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) -{ - struct task_struct *thread; - void *local_debuggerinfo; - int i; - - thread = kgdb_usethread; - if (!thread) { - thread = kgdb_info[ks->cpu].task; - local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; - } else { - local_debuggerinfo = NULL; - for_each_online_cpu(i) { - /* - * Try to find the task on some other - * or possibly this node if we do not - * find the matching task then we try - * to approximate the results. - */ - if (thread == kgdb_info[i].task) - local_debuggerinfo = kgdb_info[i].debuggerinfo; - } - } - - /* - * All threads that don't have debuggerinfo should be - * in schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. - */ - if (local_debuggerinfo) { - pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); - } else { - /* - * Pull stuff saved during switch_to; nothing - * else is accessible (or even particularly - * relevant). - * - * This should be enough for a stack trace. - */ - sleeping_thread_to_gdb_regs(gdb_regs, thread); - } - kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); -} - -/* Handle the 'G' set registers request */ -static void gdb_cmd_setregs(struct kgdb_state *ks) -{ - kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); - - if (kgdb_usethread && kgdb_usethread != current) { - error_packet(remcom_out_buffer, -EINVAL); - } else { - gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); - } -} - -/* Handle the 'm' memory read bytes */ -static void gdb_cmd_memread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long length; - unsigned long addr; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && - kgdb_hex2long(&ptr, &length) > 0) { - err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); - } else { - error_packet(remcom_out_buffer, -EINVAL); - } -} - -/* Handle the 'M' memory write bytes */ -static void gdb_cmd_memwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(0); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'X' memory binary write bytes */ -static void gdb_cmd_binwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(1); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'D' or 'k', detach or kill packets */ -static void gdb_cmd_detachkill(struct kgdb_state *ks) -{ - int error; - - /* The detach case */ - if (remcom_in_buffer[0] == 'D') { - error = remove_all_break(); - if (error < 0) { - error_packet(remcom_out_buffer, error); - } else { - strcpy(remcom_out_buffer, "OK"); - kgdb_connected = 0; - } - put_packet(remcom_out_buffer); - } else { - /* - * Assume the kill case, with no exit code checking, - * trying to force detach the debugger: - */ - remove_all_break(); - kgdb_connected = 0; - } -} - -/* Handle the 'R' reboot packets */ -static int gdb_cmd_reboot(struct kgdb_state *ks) -{ - /* For now, only honor R0 */ - if (strcmp(remcom_in_buffer, "R0") == 0) { - printk(KERN_CRIT 
"Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); - put_packet(remcom_out_buffer); - - /* - * Execution should not return from - * machine_emergency_restart() - */ - machine_emergency_restart(); - kgdb_connected = 0; - - return 1; - } - return 0; -} - -/* Handle the 'q' query packets */ -static void gdb_cmd_query(struct kgdb_state *ks) -{ - struct task_struct *g; - struct task_struct *p; - unsigned char thref[8]; - char *ptr; - int i; - int cpu; - int finished = 0; - - switch (remcom_in_buffer[1]) { - case 's': - case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - - i = 0; - remcom_out_buffer[0] = 'm'; - ptr = remcom_out_buffer + 1; - if (remcom_in_buffer[1] == 'f') { - /* Each cpu is a shadow thread */ - for_each_online_cpu(cpu) { - ks->thr_query = 0; - int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - i++; - } - } - - do_each_thread(g, p) { - if (i >= ks->thr_query && !finished) { - int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - ks->thr_query++; - if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) - finished = 1; - } - i++; - } while_each_thread(g, p); - - *(--ptr) = '\0'; - break; - - case 'C': - /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); - ks->threadid = shadow_pid(current->pid); - int_to_threadref(thref, ks->threadid); - pack_threadid(remcom_out_buffer + 2, thref); - break; - case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - ks->threadid = 0; - ptr = remcom_in_buffer + 17; - kgdb_hex2long(&ptr, &ks->threadid); - if (!getthread(ks->linux_regs, ks->threadid)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - if ((int)ks->threadid > 0) { - kgdb_mem2hex(getthread(ks->linux_regs, - ks->threadid)->comm, - remcom_out_buffer, 16); - } else { - static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - - sprintf(tmpstr, "shadowCPU%d", - (int)(-ks->threadid - 2)); - kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); - } - break; - } -} - -/* Handle the 'H' task query packets */ -static void gdb_cmd_task(struct kgdb_state *ks) -{ - struct task_struct *thread; - char *ptr; - - switch (remcom_in_buffer[1]) { - case 'g': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_usethread = thread; - ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); - break; - case 'c': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - if (!ks->threadid) { - kgdb_contthread = NULL; - } else { - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_contthread = thread; - } - strcpy(remcom_out_buffer, "OK"); - break; - } -} - -/* Handle the 'T' thread query packets */ -static void gdb_cmd_thread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - struct task_struct *thread; - - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, -EINVAL); -} - -/* Handle the 'z' or 'Z' breakpoint remove or set packets */ -static void gdb_cmd_break(struct kgdb_state *ks) -{ - /* - * Since 
GDB-5.3, it's been drafted that '0' is a software - * breakpoint, '1' is a hardware breakpoint, so let's do that. - */ - char *bpt_type = &remcom_in_buffer[1]; - char *ptr = &remcom_in_buffer[2]; - unsigned long addr; - unsigned long length; - int error = 0; - - if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { - /* Unsupported */ - if (*bpt_type > '4') - return; - } else { - if (*bpt_type != '0' && *bpt_type != '1') - /* Unsupported. */ - return; - } - - /* - * Test if this is a hardware breakpoint, and - * if we support it: - */ - if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) - /* Unsupported. */ - return; - - if (*(ptr++) != ',') { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (!kgdb_hex2long(&ptr, &addr)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (*(ptr++) != ',' || - !kgdb_hex2long(&ptr, &length)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - - if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') - error = kgdb_set_sw_break(addr); - else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') - error = kgdb_remove_sw_break(addr); - else if (remcom_in_buffer[0] == 'Z') - error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type - '0'); - else if (remcom_in_buffer[0] == 'z') - error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type - '0'); - - if (error == 0) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, error); -} - -/* Handle the 'C' signal / exception passing packets */ -static int gdb_cmd_exception_pass(struct kgdb_state *ks) -{ - /* C09 == pass exception - * C15 == detach kgdb, pass exception - */ - if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'c'; - - } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'D'; - remove_all_break(); - kgdb_connected = 0; - return 1; - - } else { - kgdb_msg_write("KGDB only knows signal 9 (pass)" - " and 15 (pass and disconnect)\n" - "Executing a continue without signal passing\n", 0); - remcom_in_buffer[0] = 'c'; - } - - /* Indicate fall through */ - return -1; -} - -/* - * This function performs all gdbserial command procesing - */ -static int gdb_serial_stub(struct kgdb_state *ks) -{ - int error = 0; - int tmp; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - if (kgdb_connected) { - unsigned char thref[8]; - char *ptr; - - /* Reply to host that an exception has occurred */ - ptr = remcom_out_buffer; - *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); - ptr += strlen(strcpy(ptr, "thread:")); - int_to_threadref(thref, shadow_pid(current->pid)); - ptr = pack_threadid(ptr, thref); - *ptr++ = ';'; - put_packet(remcom_out_buffer); - } - - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - - while (1) { - error = 0; - - /* Clear the out buffer. 
*/ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - get_packet(remcom_in_buffer); - - switch (remcom_in_buffer[0]) { - case '?': /* gdbserial status */ - gdb_cmd_status(ks); - break; - case 'g': /* return the value of the CPU registers */ - gdb_cmd_getregs(ks); - break; - case 'G': /* set the value of the CPU registers - return OK */ - gdb_cmd_setregs(ks); - break; - case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ - gdb_cmd_memread(ks); - break; - case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_memwrite(ks); - break; - case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_binwrite(ks); - break; - /* kill or detach. KGDB should treat this like a - * continue. - */ - case 'D': /* Debugger detach */ - case 'k': /* Debugger detach via kill */ - gdb_cmd_detachkill(ks); - goto default_handle; - case 'R': /* Reboot */ - if (gdb_cmd_reboot(ks)) - goto default_handle; - break; - case 'q': /* query command */ - gdb_cmd_query(ks); - break; - case 'H': /* task related */ - gdb_cmd_task(ks); - break; - case 'T': /* Query thread status */ - gdb_cmd_thread(ks); - break; - case 'z': /* Break point remove */ - case 'Z': /* Break point set */ - gdb_cmd_break(ks); - break; - case 'C': /* Exception passing */ - tmp = gdb_cmd_exception_pass(ks); - if (tmp > 0) - goto default_handle; - if (tmp == 0) - break; - /* Fall through on tmp < 0 */ - case 'c': /* Continue packet */ - case 's': /* Single step packet */ - if (kgdb_contthread && kgdb_contthread != current) { - /* Can't switch threads in kgdb */ - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_activate_sw_breakpoints(); - /* Fall through to default processing */ - default: -default_handle: - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - /* - * Leave cmd processing on error, detach, - * kill, continue, or single step. - */ - if (error >= 0 || remcom_in_buffer[0] == 'D' || - remcom_in_buffer[0] == 'k') { - error = 0; - goto kgdb_exit; - } - - } - - /* reply to the request */ - put_packet(remcom_out_buffer); - } - -kgdb_exit: - if (ks->pass_exception) - error = 1; - return error; -} - -static int kgdb_reenter_check(struct kgdb_state *ks) -{ - unsigned long addr; - - if (atomic_read(&kgdb_active) != raw_smp_processor_id()) - return 0; - - /* Panic on recursive debugger calls: */ - exception_level++; - addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - kgdb_deactivate_sw_breakpoints(); - - /* - * If the break point removed ok at the place exception - * occurred, try to recover and print a warning to the end - * user because the user planted a breakpoint in a place that - * KGDB needs in order to function. 
- */ - if (kgdb_remove_sw_break(addr) == 0) { - exception_level = 0; - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - kgdb_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); - WARN_ON_ONCE(1); - - return 1; - } - remove_all_break(); - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - - if (exception_level > 1) { - dump_stack(); - panic("Recursive entry to debugger"); - } - - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); - dump_stack(); - panic("Recursive entry to debugger"); - - return 1; -} - -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - * interface locks, if any (begin_session) - * kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) -{ - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - unsigned long flags; - int sstep_tries = 100; - int error = 0; - int i, cpu; - - ks->cpu = raw_smp_processor_id(); - ks->ex_vector = evector; - ks->signo = signo; - ks->ex_vector = evector; - ks->err_code = ecode; - ks->kgdb_usethreadid = 0; - ks->linux_regs = regs; - - if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! */ - -acquirelock: - /* - * Interrupts will be restored by the 'trap return' code, except when - * single stepping. - */ - local_irq_save(flags); - - cpu = raw_smp_processor_id(); - - /* - * Acquire the kgdb_active lock: - */ - while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) - cpu_relax(); - - /* - * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the - * kernel will only try for the value of sstep_tries before - * giving up and continuing on. - */ - if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - (kgdb_info[cpu].task && - kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - goto acquirelock; - } - - if (!kgdb_io_ready(1)) { - error = 1; - goto kgdb_restore; /* No I/O connection, so resume the system */ - } - - /* - * Don't enter if we have hit a removed breakpoint. 
- */ - if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) - goto kgdb_restore; - - /* Call the I/O driver's pre_exception routine */ - if (kgdb_io_ops->pre_exception) - kgdb_io_ops->pre_exception(); - - kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; - kgdb_info[ks->cpu].task = current; - - kgdb_disable_hw_debug(ks->linux_regs); - - /* - * Get the passive CPU lock which will hold all the non-primary - * CPU in a spin state while the debugger is active - */ - if (!kgdb_single_step) { - for (i = 0; i < NR_CPUS; i++) - atomic_set(&passive_cpu_wait[i], 1); - } - - /* - * spin_lock code is good enough as a barrier so we don't - * need one here: - */ - atomic_set(&cpu_in_kgdb[ks->cpu], 1); - -#ifdef CONFIG_SMP - /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); -#endif - - /* - * Wait for the other CPUs to be notified and be waiting for us: - */ - for_each_online_cpu(i) { - while (!atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - - /* - * At this point the primary processor is completely - * in the debugger and all secondary CPUs are quiescent - */ - kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); - kgdb_deactivate_sw_breakpoints(); - kgdb_single_step = 0; - kgdb_contthread = current; - exception_level = 0; - - /* Talk to debugger with gdbserial protocol */ - error = gdb_serial_stub(ks); - - /* Call the I/O driver's post_exception routine */ - if (kgdb_io_ops->post_exception) - kgdb_io_ops->post_exception(); - - kgdb_info[ks->cpu].debuggerinfo = NULL; - kgdb_info[ks->cpu].task = NULL; - atomic_set(&cpu_in_kgdb[ks->cpu], 0); - - if (!kgdb_single_step) { - for (i = NR_CPUS-1; i >= 0; i--) - atomic_set(&passive_cpu_wait[i], 0); - /* - * Wait till all the CPUs have quit - * from the debugger. - */ - for_each_online_cpu(i) { - while (atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - } - -kgdb_restore: - if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { - int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); - if (kgdb_info[sstep_cpu].task) - kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; - else - kgdb_sstep_pid = 0; - } - /* Free kgdb_active */ - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - return error; -} - -int kgdb_nmicallback(int cpu, void *regs) -{ -#ifdef CONFIG_SMP - if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != cpu && - atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { - kgdb_wait((struct pt_regs *)regs); - return 0; - } -#endif - return 1; -} - -static void kgdb_console_write(struct console *co, const char *s, - unsigned count) -{ - unsigned long flags; - - /* If we're debugging, or KGDB has not connected, don't try - * and print. 
*/ - if (!kgdb_connected || atomic_read(&kgdb_active) != -1) - return; - - local_irq_save(flags); - kgdb_msg_write(s, count); - local_irq_restore(flags); -} - -static struct console kgdbcons = { - .name = "kgdb", - .write = kgdb_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED, - .index = -1, -}; - -#ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_gdb(int key, struct tty_struct *tty) -{ - if (!kgdb_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); - return; - } - if (!kgdb_connected) - printk(KERN_CRIT "Entering KGDB\n"); - - kgdb_breakpoint(); -} - -static struct sysrq_key_op sysrq_gdb_op = { - .handler = sysrq_handle_gdb, - .help_msg = "debug(G)", - .action_msg = "DEBUG", -}; -#endif - -static void kgdb_register_callbacks(void) -{ - if (!kgdb_io_module_registered) { - kgdb_io_module_registered = 1; - kgdb_arch_init(); -#ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_use_con && !kgdb_con_registered) { - register_console(&kgdbcons); - kgdb_con_registered = 1; - } - } -} - -static void kgdb_unregister_callbacks(void) -{ - /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any - * break exceptions at the time. - */ - if (kgdb_io_module_registered) { - kgdb_io_module_registered = 0; - kgdb_arch_exit(); -#ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_con_registered) { - unregister_console(&kgdbcons); - kgdb_con_registered = 0; - } - } -} - -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - -/** - * kgdb_register_io_module - register KGDB IO module - * @new_kgdb_io_ops: the io ops vector - * - * Register it with the KGDB core. - */ -int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) -{ - int err; - - spin_lock(&kgdb_registration_lock); - - if (kgdb_io_ops) { - spin_unlock(&kgdb_registration_lock); - - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); - return -EBUSY; - } - - if (new_kgdb_io_ops->init) { - err = new_kgdb_io_ops->init(); - if (err) { - spin_unlock(&kgdb_registration_lock); - return err; - } - } - - kgdb_io_ops = new_kgdb_io_ops; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_kgdb_io_ops->name); - - /* Arm KGDB now. */ - kgdb_register_callbacks(); - - if (kgdb_break_asap) - kgdb_initial_breakpoint(); - - return 0; -} -EXPORT_SYMBOL_GPL(kgdb_register_io_module); - -/** - * kkgdb_unregister_io_module - unregister KGDB IO module - * @old_kgdb_io_ops: the io ops vector - * - * Unregister it with the KGDB core. - */ -void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) -{ - BUG_ON(kgdb_connected); - - /* - * KGDB is no longer able to communicate out, so - * unregister our callbacks and reset state. - */ - kgdb_unregister_callbacks(); - - spin_lock(&kgdb_registration_lock); - - WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); - kgdb_io_ops = NULL; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", - old_kgdb_io_ops->name); -} -EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); - -/** - * kgdb_breakpoint - generate breakpoint exception - * - * This function will generate a breakpoint exception. 
It is used at the - * beginning of a program to sync up with a debugger and can be used - * otherwise as a quick means to stop program execution and "break" into - * the debugger. - */ -void kgdb_breakpoint(void) -{ - atomic_set(&kgdb_setting_breakpoint, 1); - wmb(); /* Sync point before breakpoint */ - arch_kgdb_breakpoint(); - wmb(); /* Sync point after breakpoint */ - atomic_set(&kgdb_setting_breakpoint, 0); -} -EXPORT_SYMBOL_GPL(kgdb_breakpoint); - -static int __init opt_kgdb_wait(char *str) -{ - kgdb_break_asap = 1; - - if (kgdb_io_module_registered) - kgdb_initial_breakpoint(); - - return 0; -} - -early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/kmod.c b/kernel/kmod.c index bf0e231d970..9cd0591c96a 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); + ret = call_usermodehelper_fns(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, + NULL, NULL, NULL); + atomic_dec(&kmod_concurrent); return ret; } EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ -struct subprocess_info { - struct work_struct work; - struct completion *complete; - struct cred *cred; - char *path; - char **argv; - char **envp; - enum umh_wait wait; - int retval; - struct file *stdin; - void (*cleanup)(char **argv, char **envp); -}; - /* * This is the task which runs the usermode application */ @@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data) struct subprocess_info *sub_info = data; int retval; - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); - - /* Unblock all signals */ spin_lock_irq(¤t->sighand->siglock); flush_signal_handlers(current, 1); - sigemptyset(¤t->blocked); - recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - /* Install the credentials */ - commit_creds(sub_info->cred); - sub_info->cred = NULL; - - /* Install input pipe when needed */ - if (sub_info->stdin) { - struct files_struct *f = current->files; - struct fdtable *fdt; - /* no races because files should be private here */ - sys_close(0); - fd_install(0, sub_info->stdin); - spin_lock(&f->file_lock); - fdt = files_fdtable(f); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); - spin_unlock(&f->file_lock); - - /* and disallow core files too */ - current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; - } - /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed_ptr(current, cpu_all_mask); @@ -184,9 +147,18 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); + if (sub_info->init) { + retval = sub_info->init(sub_info); + if (retval) + goto fail; + } + + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); /* Exec failed? 
*/ +fail: sub_info->retval = retval; do_exit(0); } @@ -194,9 +166,7 @@ static int ____call_usermodehelper(void *data) void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) - (*info->cleanup)(info->argv, info->envp); - if (info->cred) - put_cred(info->cred); + (*info->cleanup)(info); kfree(info); } EXPORT_SYMBOL(call_usermodehelper_freeinfo); @@ -207,16 +177,16 @@ static int wait_for_helper(void *data) struct subprocess_info *sub_info = data; pid_t pid; - /* Install a handler: if SIGCLD isn't handled sys_wait4 won't - * populate the status, but will return -ECHILD. */ - allow_signal(SIGCHLD); + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; + spin_unlock_irq(¤t->sighand->siglock); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; } else { - int ret; - + int ret = -ECHILD; /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. @@ -237,10 +207,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait == UMH_NO_WAIT) - call_usermodehelper_freeinfo(sub_info); - else - complete(sub_info->complete); + complete(sub_info->complete); return 0; } @@ -249,15 +216,13 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - pid_t pid; enum umh_wait wait = sub_info->wait; - - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + pid_t pid; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. 
*/ - if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) + if (wait == UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else @@ -266,15 +231,16 @@ static void __call_usermodehelper(struct work_struct *work) switch (wait) { case UMH_NO_WAIT: + call_usermodehelper_freeinfo(sub_info); break; case UMH_WAIT_PROC: if (pid > 0) break; - sub_info->retval = pid; /* FALLTHROUGH */ - case UMH_WAIT_EXEC: + if (pid < 0) + sub_info->retval = pid; complete(sub_info->complete); } } @@ -376,80 +342,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; - sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) { - kfree(sub_info); - return NULL; - } - out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** - * call_usermodehelper_setkeys - set the session keys for usermode helper - * @info: a subprocess_info returned by call_usermodehelper_setup - * @session_keyring: the session keyring for the process - */ -void call_usermodehelper_setkeys(struct subprocess_info *info, - struct key *session_keyring) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = info->cred->tgcred; - key_put(tgcred->session_keyring); - tgcred->session_keyring = key_get(session_keyring); -#else - BUG(); -#endif -} -EXPORT_SYMBOL(call_usermodehelper_setkeys); - -/** - * call_usermodehelper_setcleanup - set a cleanup function + * call_usermodehelper_setfns - set a cleanup/init function * @info: a subprocess_info returned by call_usermodehelper_setup * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data * - * The cleanup function is just befor ethe subprocess_info is about to + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ -void call_usermodehelper_setcleanup(struct subprocess_info *info, - void (*cleanup)(char **argv, char **envp)) +void call_usermodehelper_setfns(struct subprocess_info *info, + int (*init)(struct subprocess_info *info), + void (*cleanup)(struct subprocess_info *info), + void *data) { info->cleanup = cleanup; + info->init = init; + info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setcleanup); - -/** - * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin - * @sub_info: a subprocess_info returned by call_usermodehelper_setup - * @filp: set to the write-end of a pipe - * - * This constructs a pipe, and sets the read end to be the stdin of the - * subprocess, and returns the write-end in *@filp. 
- */ -int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, - struct file **filp) -{ - struct file *f; - - f = create_write_pipe(0); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; - - f = create_read_pipe(f, 0); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info->stdin = f; - - return 0; -} -EXPORT_SYMBOL(call_usermodehelper_stdinpipe); +EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -469,9 +392,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); - validate_creds(sub_info->cred); - helper_lock(); if (sub_info->path[0] == '\0') goto out; @@ -498,41 +418,6 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); -/** - * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @filp: set to the write-end of a pipe - * - * This is a simple wrapper which executes a usermode-helper function - * with a pipe as stdin. It is implemented entirely in terms of - * lower-level call_usermodehelper_* functions. - */ -int call_usermodehelper_pipe(char *path, char **argv, char **envp, - struct file **filp) -{ - struct subprocess_info *sub_info; - int ret; - - sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL); - if (sub_info == NULL) - return -ENOMEM; - - ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) { - call_usermodehelper_freeinfo(sub_info); - return ret; - } - - ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (ret < 0) /* Failed to execute helper, close pipe */ - filp_close(*filp, NULL); - - return ret; -} -EXPORT_SYMBOL(call_usermodehelper_pipe); - void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7df302a020..282035f3ae9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -42,8 +42,11 @@ #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/sysctl.h> #include <linux/kdebug.h> #include <linux/memory.h> +#include <linux/ftrace.h> +#include <linux/cpu.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { {"native_get_debugreg",}, {"irq_entries_start",}, {"common_interrupt",}, + {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; @@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) - struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; int nused; int ngarbage; + char slot_used[]; +}; + +#define KPROBE_INSN_PAGE_SIZE(slots) \ + (offsetof(struct kprobe_insn_page, slot_used) + \ + (sizeof(char) * (slots))) + +struct kprobe_insn_cache { + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; }; +static int slots_per_page(struct kprobe_insn_cache *c) +{ + return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); +} + enum kprobe_slot_state { SLOT_CLEAN = 0, SLOT_DIRTY = 1, SLOT_USED = 2, }; -static 
DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static LIST_HEAD(kprobe_insn_pages); -static int kprobe_garbage_slots; -static int collect_garbage_slots(void); - -static int __kprobes check_safety(void) -{ - int ret = 0; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER) - ret = freeze_processes(); - if (ret == 0) { - struct task_struct *p, *q; - do_each_thread(p, q) { - if (p != current && p->state == TASK_RUNNING && - p->pid != 0) { - printk("Check failed: %s is running\n",p->comm); - ret = -1; - goto loop_end; - } - } while_each_thread(p, q); - } -loop_end: - thaw_processes(); -#else - synchronize_sched(); -#endif - return ret; -} +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ +static struct kprobe_insn_cache kprobe_insn_slots = { + .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), + .insn_size = MAX_INSN_SIZE, + .nr_garbage = 0, +}; +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; retry: - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->nused < INSNS_PER_PAGE) { + list_for_each_entry(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * MAX_INSN_SIZE); + return kip->insns + (i * c->insn_size); } } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); } } /* If there are any garbage slots, collect it and try again. */ - if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + if (c->nr_garbage && collect_garbage_slots(c) == 0) goto retry; - } - /* All out of space. Need to allocate a new page. Use slot 0. */ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + + /* All out of space. Need to allocate a new page. */ + kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) return NULL; @@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) return NULL; } INIT_LIST_HEAD(&kip->list); - list_add(&kip->list, &kprobe_insn_pages); - memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); + memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + list_add(&kip->list, &c->pages); return kip->insns; } + kprobe_opcode_t __kprobes *get_insn_slot(void) { - kprobe_opcode_t *ret; + kprobe_opcode_t *ret = NULL; + mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(); + ret = __get_insn_slot(&kprobe_insn_slots); mutex_unlock(&kprobe_insn_mutex); + return ret; } @@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. 
*/ - if (!list_is_singular(&kprobe_insn_pages)) { + if (!list_is_singular(&kip->list)) { list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); @@ -231,52 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(void) +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; - /* Ensure no-one is preepmted on the garbages */ - if (check_safety()) - return -EAGAIN; + /* Ensure no-one is interrupted on the garbages */ + synchronize_sched(); - list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { + list_for_each_entry_safe(kip, next, &c->pages, list) { int i; if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } - kprobe_garbage_slots = 0; + c->nr_garbage = 0; return 0; } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; - mutex_lock(&kprobe_insn_mutex); - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->insns <= slot && - slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; + list_for_each_entry(kip, &c->pages, list) { + long idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); + if (idx >= 0 && idx < slots_per_page(c)) { + WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { - kip->slot_used[i] = SLOT_DIRTY; + kip->slot_used[idx] = SLOT_DIRTY; kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); } else - collect_one_slot(kip, i); - break; + collect_one_slot(kip, idx); + return; } } + /* Could not free this slot. */ + WARN_ON(1); +} - if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) - collect_garbage_slots(); - +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_insn_mutex); + __free_insn_slot(&kprobe_insn_slots, slot, dirty); mutex_unlock(&kprobe_insn_mutex); } +#ifdef CONFIG_OPTPROBES +/* For optimized_kprobe buffer */ +static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ +static struct kprobe_insn_cache kprobe_optinsn_slots = { + .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), + /* .insn_size is initialized later */ + .nr_garbage = 0, +}; +/* Get a slot for optimized_kprobe buffer */ +kprobe_opcode_t __kprobes *get_optinsn_slot(void) +{ + kprobe_opcode_t *ret = NULL; + + mutex_lock(&kprobe_optinsn_mutex); + ret = __get_insn_slot(&kprobe_optinsn_slots); + mutex_unlock(&kprobe_optinsn_mutex); + + return ret; +} + +void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_optinsn_mutex); + __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); + mutex_unlock(&kprobe_optinsn_mutex); +} +#endif #endif /* We have preemption disabled.. 
so it is safe to use __ versions */ @@ -307,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) if (p->addr == addr) return p; } + return NULL; } +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); + +/* Return true if the kprobe is an aggregator */ +static inline int kprobe_aggrprobe(struct kprobe *p) +{ + return p->pre_handler == aggr_pre_handler; +} + +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); +} + +#ifdef CONFIG_OPTPROBES +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobes_allow_optimization; + +/* + * Call all pre_handler on the list, but ignores its return value. + * This must be called from arch-dep optimized caller. + */ +void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { + set_kprobe_instance(kp); + kp->pre_handler(kp, regs); + } + reset_kprobe_instance(); + } +} + +/* Return true(!0) if the kprobe is ready for optimization. */ +static inline int kprobe_optready(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + return arch_prepared_optinsn(&op->optinsn); + } + + return 0; +} + +/* + * Return an optimized kprobe whose optimizing code replaces + * instructions including addr (exclude breakpoint). + */ +struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +{ + int i; + struct kprobe *p = NULL; + struct optimized_kprobe *op; + + /* Don't check i == 0, since that is a breakpoint case. */ + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) + p = get_kprobe((void *)(addr - i)); + + if (p && kprobe_optready(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (arch_within_optimized_kprobe(op, addr)) + return p; + } + + return NULL; +} + +/* Optimization staging list, protected by kprobe_mutex */ +static LIST_HEAD(optimizing_list); + +static void kprobe_optimizer(struct work_struct *work); +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +#define OPTIMIZE_DELAY 5 + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + struct optimized_kprobe *op, *tmp; + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + if (kprobes_all_disarmed || !kprobes_allow_optimization) + goto end; + + /* + * Wait for quiesence period to ensure all running interrupts + * are done. Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* + * The optimization/unoptimization refers online_cpus via + * stop_machine() and cpu-hotplug modifies online_cpus. + * And same time, text_mutex will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug try to lock + * text_mutex but stop_machine can not be done because online_cpus + * has been changed) + * To avoid this deadlock, we need to call get_online_cpus() + * for preventing cpu-hotplug outside of text_mutex locking. 
+ */ + get_online_cpus(); + mutex_lock(&text_mutex); + list_for_each_entry_safe(op, tmp, &optimizing_list, list) { + WARN_ON(kprobe_disabled(&op->kp)); + if (arch_optimize_kprobe(op) < 0) + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + list_del_init(&op->list); + } + mutex_unlock(&text_mutex); + put_online_cpus(); +end: + mutex_unlock(&kprobe_mutex); + mutex_unlock(&module_mutex); +} + +/* Optimize kprobe if p is ready to be optimized */ +static __kprobes void optimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* Check if the kprobe is disabled or not ready for optimization. */ + if (!kprobe_optready(p) || !kprobes_allow_optimization || + (kprobe_disabled(p) || kprobes_all_disarmed)) + return; + + /* Both of break_handler and post_handler are not supported. */ + if (p->break_handler || p->post_handler) + return; + + op = container_of(p, struct optimized_kprobe, kp); + + /* Check there is no other kprobes at the optimized instructions */ + if (arch_check_optimized_kprobe(op) < 0) + return; + + /* Check if it is already optimized. */ + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + return; + + op->kp.flags |= KPROBE_FLAG_OPTIMIZED; + list_add(&op->list, &optimizing_list); + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Unoptimize a kprobe if p is optimized */ +static __kprobes void unoptimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + else + /* Replace jump with break */ + arch_unoptimize_kprobe(op); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } +} + +/* Remove optimized instructions */ +static void __kprobes kill_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } + /* Don't unoptimize, because the target code will be freed. 
*/ + arch_remove_optimized_kprobe(op); +} + +/* Try to prepare optimized instructions */ +static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_prepare_optimized_kprobe(op); +} + +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + kfree(op); +} + +/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); + if (!op) + return NULL; + + INIT_LIST_HEAD(&op->list); + op->kp.addr = p->addr; + arch_prepare_optimized_kprobe(op); + + return &op->kp; +} + +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); + +/* + * Prepare an optimized_kprobe and optimize it + * NOTE: p must be a normal registered kprobe + */ +static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +{ + struct kprobe *ap; + struct optimized_kprobe *op; + + ap = alloc_aggr_kprobe(p); + if (!ap) + return; + + op = container_of(ap, struct optimized_kprobe, kp); + if (!arch_prepared_optinsn(&op->optinsn)) { + /* If failed to setup optimizing, fallback to kprobe */ + free_aggr_kprobe(ap); + return; + } + + init_aggr_kprobe(ap, p); + optimize_kprobe(ap); +} + +#ifdef CONFIG_SYSCTL +static void __kprobes optimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already allowed, just return */ + if (kprobes_allow_optimization) + return; + + kprobes_allow_optimization = true; + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (!kprobe_disabled(p)) + optimize_kprobe(p); + } + mutex_unlock(&text_mutex); + printk(KERN_INFO "Kprobes globally optimized\n"); +} + +static void __kprobes unoptimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already prohibited, just return */ + if (!kprobes_allow_optimization) + return; + + kprobes_allow_optimization = false; + printk(KERN_INFO "Kprobes globally unoptimized\n"); + get_online_cpus(); /* For avoiding text_mutex deadlock */ + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!kprobe_disabled(p)) + unoptimize_kprobe(p); + } + } + + mutex_unlock(&text_mutex); + put_online_cpus(); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); +} + +int sysctl_kprobes_optimization; +int proc_kprobes_optimization_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + + mutex_lock(&kprobe_mutex); + sysctl_kprobes_optimization = kprobes_allow_optimization ? 
1 : 0; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + + if (sysctl_kprobes_optimization) + optimize_all_kprobes(); + else + unoptimize_all_kprobes(); + mutex_unlock(&kprobe_mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ + +static void __kprobes __arm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + /* Check collision with other optimized kprobes */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ + + arch_arm_kprobe(p); + optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ +} + +static void __kprobes __disarm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + unoptimize_kprobe(p); /* Try to unoptimize */ + arch_disarm_kprobe(p); + + /* If another kprobe was blocked, optimize it. */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + optimize_kprobe(old_p); +} + +#else /* !CONFIG_OPTPROBES */ + +#define optimize_kprobe(p) do {} while (0) +#define unoptimize_kprobe(p) do {} while (0) +#define kill_optimized_kprobe(p) do {} while (0) +#define prepare_optimized_kprobe(p) do {} while (0) +#define try_to_optimize_kprobe(p) do {} while (0) +#define __arm_kprobe(p) arch_arm_kprobe(p) +#define __disarm_kprobe(p) arch_disarm_kprobe(p) + +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + kfree(p); +} + +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + return kzalloc(sizeof(struct kprobe), GFP_KERNEL); +} +#endif /* CONFIG_OPTPROBES */ + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + /* + * Here, since __arm_kprobe() doesn't use stop_machine(), + * this doesn't cause deadlock on text_mutex. So, we don't + * need get_online_cpus(). + */ mutex_lock(&text_mutex); - arch_arm_kprobe(kp); + __arm_kprobe(kp); mutex_unlock(&text_mutex); } /* Disarm a kprobe with text_mutex */ static void __kprobes disarm_kprobe(struct kprobe *kp) { + get_online_cpus(); /* For avoiding text_mutex deadlock */ mutex_lock(&text_mutex); - arch_disarm_kprobe(kp); + __disarm_kprobe(kp); mutex_unlock(&text_mutex); + put_online_cpus(); } /* @@ -392,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; - if (p->pre_handler != aggr_pre_handler) { + if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) @@ -516,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) } /* - * Keep all fields in the kprobe consistent - */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) -{ - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); -} - -/* * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); + + if (p->break_handler || p->post_handler) + unoptimize_kprobe(ap); /* Fall back to normal kprobe */ + if (p->break_handler) { if (ap->break_handler) return -EEXIST; @@ -545,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. 
*/ - arm_kprobe(ap); + __arm_kprobe(ap); } return 0; } @@ -554,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ -static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { + /* Copy p's insn slot to ap */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; - ap->flags = p->flags; + ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ @@ -569,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); - list_add_rcu(&p->list, &ap->list); + INIT_HLIST_NODE(&ap->hlist); + list_add_rcu(&p->list, &ap->list); hlist_replace_rcu(&p->hlist, &ap->hlist); } @@ -584,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap = old_p; - if (old_p->pre_handler != aggr_pre_handler) { - /* If old_p is not an aggr_probe, create new aggr_kprobe. */ - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kprobe_aggrprobe(old_p)) { + /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ + ap = alloc_aggr_kprobe(old_p); if (!ap) return -ENOMEM; - add_aggr_kprobe(ap, old_p); + init_aggr_kprobe(ap, old_p); } if (kprobe_gone(ap)) { @@ -608,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, */ return ret; + /* Prepare optimized instructions if possible. */ + prepare_optimized_kprobe(ap); + /* * Clear gone flag to prevent allocating new slot again, and * set disabled flag because it is not armed yet. @@ -616,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | KPROBE_FLAG_DISABLED; } + /* Copy ap's insn slot to p */ copy_kprobe(ap, p); return add_new_kprobe(ap, p); } @@ -728,7 +1140,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) { + in_kprobes_functions((unsigned long) p->addr) || + ftrace_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } @@ -765,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p) p->nmissed = 0; INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + + get_online_cpus(); /* For avoiding text_mutex deadlock. */ + mutex_lock(&text_mutex); + old_p = get_kprobe(p->addr); if (old_p) { + /* Since this may unoptimize old_p, locking text_mutex. */ ret = register_aggr_kprobe(old_p, p); goto out; } - mutex_lock(&text_mutex); ret = arch_prepare_kprobe(p); if (ret) - goto out_unlock_text; + goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); + + /* Try to optimize kprobe */ + try_to_optimize_kprobe(p); -out_unlock_text: - mutex_unlock(&text_mutex); out: + mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); if (probed_mod) @@ -807,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) return -EINVAL; if (old_p == p || - (old_p->pre_handler == aggr_pre_handler && + (kprobe_aggrprobe(old_p) && list_is_singular(&old_p->list))) { /* * Only probe on the hash list. 
Disarm only if kprobes are @@ -815,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * already have been removed. We save on flushing icache. */ if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) - disarm_kprobe(p); + disarm_kprobe(old_p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -831,8 +1251,13 @@ noclean: list_del_rcu(&p->list); if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); - if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - disarm_kprobe(old_p); + if (!kprobes_all_disarmed) { + if (kprobe_disabled(old_p)) + disarm_kprobe(old_p); + else + /* Try to optimize this probe again */ + optimize_kprobe(old_p); + } } } return 0; @@ -849,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) old_p = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); arch_remove_kprobe(old_p); - kfree(old_p); + free_aggr_kprobe(old_p); } } @@ -1145,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) struct kprobe *kp; p->flags |= KPROBE_FLAG_GONE; - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. @@ -1154,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; p->break_handler = NULL; + kill_optimized_kprobe(p); } /* * Here, we can remove insn_slot safely, because no thread calls @@ -1162,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. 
*/ + ret = -EINVAL; + goto out; + } + + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + void __kprobes dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); @@ -1263,6 +1755,15 @@ static int __init init_kprobes(void) } } +#if defined(CONFIG_OPTPROBES) +#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) + /* Init kprobe_optinsn_slots */ + kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; +#endif + /* By default, kprobes can be optimized */ + kprobes_allow_optimization = true; +#endif + /* By default, kprobes are armed */ kprobes_all_disarmed = false; @@ -1281,7 +1782,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -1291,19 +1792,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, kprobe_type = "j"; else kprobe_type = "k"; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + seq_printf(pi, "%p %s %s+0x%x %s ", p->addr, kprobe_type, sym, offset, - (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + (modname ? modname : " ")); else - seq_printf(pi, "%p %s %p %s%s\n", - p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + seq_printf(pi, "%p %s %p ", + p->addr, kprobe_type, p->addr); + + if (!pp) + pp = p; + seq_printf(pi, "%s%s%s\n", + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), + (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1339,11 +1842,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) hlist_for_each_entry_rcu(p, node, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { list_for_each_entry_rcu(kp, &p->list, list) - report_probe(pi, kp, sym, offset, modname); + report_probe(pi, kp, sym, offset, modname, p); } else - report_probe(pi, p, sym, offset, modname); + report_probe(pi, p, sym, offset, modname, NULL); } preempt_enable(); return 0; @@ -1368,71 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -/* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled. 
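
With disable_kprobe()/enable_kprobe() relocated out of the CONFIG_DEBUG_FS-only region (the new copies were added earlier in this file; the old ones are removed here), a module can mute a probe without unregistering it. A minimal sketch of that pattern; the target symbol, names and handler below are illustrative only:

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int demo_pre(struct kprobe *p, struct pt_regs *regs)
	{
		pr_info("probe hit at %p\n", p->addr);
		return 0;
	}

	static struct kprobe demo_kp = {
		.symbol_name	= "do_fork",	/* illustrative target */
		.pre_handler	= demo_pre,
	};

	static int __init demo_init(void)
	{
		int ret = register_kprobe(&demo_kp);

		if (ret)
			return ret;

		disable_kprobe(&demo_kp);	/* breakpoint disarmed, probe kept */
		enable_kprobe(&demo_kp);	/* armed (and re-optimized) again */
		return 0;
	}

	static void __exit demo_exit(void)
	{
		unregister_kprobe(&demo_kp);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
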
*/ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(disable_kprobe); - -/* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - if (kprobe_gone(kp)) { - /* This kprobe has gone, we couldn't enable it. */ - ret = -EINVAL; - goto out; - } - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - arm_kprobe(p); - - p->flags &= ~KPROBE_FLAG_DISABLED; - if (p != kp) - kp->flags &= ~KPROBE_FLAG_DISABLED; -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(enable_kprobe); - static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; @@ -1446,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void) if (!kprobes_all_disarmed) goto already_enabled; + /* Arming kprobes doesn't optimize kprobe itself */ mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); } mutex_unlock(&text_mutex); @@ -1478,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); + + /* + * Here we call get_online_cpus() for avoiding text_mutex deadlock, + * because disarming may also unoptimize kprobes. + */ + get_online_cpus(); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - arch_disarm_kprobe(p); + __disarm_kprobe(p); } } mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); /* Allow all currently running kprobes to complete */ synchronize_sched(); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a7451..0b624e79180 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); -/* uevent helper program, used during early boo */ +/* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak)); extern const void __stop_notes __attribute__((weak)); #define notes_size (&__stop_notes - &__start_notes) -static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, +static ssize_t notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, &__start_notes + off, count); @@ -197,16 +198,8 @@ static int __init ksysfs_init(void) goto group_exit; } - /* create the /sys/kernel/uids/ directory */ - error = uids_sysfs_init(); - if (error) - goto notes_exit; - return 0; -notes_exit: - if (notes_size > 0) - sysfs_remove_bin_file(kernel_kobj, ¬es_attr); group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e..2dc3786349d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,8 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/freezer.h> #include 
<trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -35,6 +37,7 @@ struct kthread_create_info struct kthread { int should_stop; + void *data; struct completion exited; }; @@ -54,6 +57,19 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. + */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -64,6 +80,7 @@ static int kthread(void *_create) int ret; self.should_stop = 0; + self.data = data; init_completion(&self.exited); current->vfork_done = &self.exited; @@ -101,7 +118,7 @@ static void create_kthread(struct kthread_create_info *create) * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). + * it. See also kthread_run(). * * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a @@ -219,7 +236,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; @@ -247,3 +264,150 @@ int kthreadd(void *unused) return 0; } + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * the it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works. 
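
A minimal usage sketch of this worker model, assuming the init_kthread_worker()/init_kthread_work() initializers that the matching include/linux/kthread.h change provides (that header is not part of this hunk); all names below are illustrative:

	#include <linux/kthread.h>
	#include <linux/err.h>

	static struct kthread_worker demo_worker;
	static struct kthread_work demo_work;

	static void demo_work_fn(struct kthread_work *work)
	{
		pr_info("running in the worker kthread\n");
	}

	static int demo_start(void)
	{
		struct task_struct *tsk;

		init_kthread_worker(&demo_worker);
		init_kthread_work(&demo_work, demo_work_fn);

		tsk = kthread_run(kthread_worker_fn, &demo_worker, "demo_worker");
		if (IS_ERR(tsk))
			return PTR_ERR(tsk);

		queue_kthread_work(&demo_worker, &demo_work);
		flush_kthread_work(&demo_work);	/* wait for this one work item */

		kthread_stop(tsk);	/* worker exits via kthread_should_stop() */
		return 0;
	}

flush_kthread_worker(), documented further below, would instead wait for everything queued on the worker so far.
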
+ */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @task for async execution. @task + * must have been created with kthread_worker_create(). Returns %true + * if @work was successfully queued, %false if it was already pending. + */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished. 
+ */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c0c91..877fb306d41 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -56,7 +56,6 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/list.h> -#include <linux/slab.h> #include <linux/stacktrace.h> static DEFINE_SPINLOCK(latency_lock); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe4..f2852a51023 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,6 +43,7 @@ #include <linux/ftrace.h> #include <linux/stringify.h> #include <linux/bitops.h> +#include <linux/gfp.h> #include <asm/sections.h> @@ -145,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) @@ -430,20 +431,7 @@ static struct stack_trace lockdep_init_trace = { /* * Various lockdep statistics: */ -atomic_t chain_lookup_hits; -atomic_t chain_lookup_misses; -atomic_t hardirqs_on_events; -atomic_t hardirqs_off_events; -atomic_t redundant_hardirqs_on; -atomic_t redundant_hardirqs_off; -atomic_t softirqs_on_events; -atomic_t softirqs_off_events; -atomic_t redundant_softirqs_on; -atomic_t redundant_softirqs_off; -atomic_t nr_unused_locks; -atomic_t nr_cyclic_checks; -atomic_t nr_find_usage_forwards_checks; -atomic_t nr_find_usage_backwards_checks; +DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif /* @@ -582,9 +570,6 @@ static int static_obj(void *obj) unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif /* * static variable? @@ -595,24 +580,16 @@ static int static_obj(void *obj) if (arch_is_kernel_data(addr)) return 1; -#ifdef CONFIG_SMP /* - * percpu var? + * in-kernel percpu var? */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif + if (is_kernel_percpu_address(addr)) + return 1; /* - * module var? + * module static or percpu var? 
*/ - return is_module_address(addr); + return is_module_address(addr) || is_module_percpu_address(addr); } /* @@ -758,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); + debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; @@ -828,7 +805,8 @@ static struct lock_list *alloc_list_entry(void) * Add a new dependency to the head of the list: */ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, int distance) + struct list_head *head, unsigned long ip, + int distance, struct stack_trace *trace) { struct lock_list *entry; /* @@ -839,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - if (!save_trace(&entry->trace)) - return 0; - entry->class = this; entry->distance = distance; + entry->trace = *trace; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation @@ -1215,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target, { int result; - debug_atomic_inc(&nr_cyclic_checks); + debug_atomic_inc(nr_cyclic_checks); result = __bfs_forwards(root, target, class_equal, target_entry); @@ -1252,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, { int result; - debug_atomic_inc(&nr_find_usage_forwards_checks); + debug_atomic_inc(nr_find_usage_forwards_checks); result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); @@ -1275,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, { int result; - debug_atomic_inc(&nr_find_usage_backwards_checks); + debug_atomic_inc(nr_find_usage_backwards_checks); result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); @@ -1645,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance) + struct held_lock *next, int distance, int trylock_loop) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list *uninitialized_var(target_entry); + /* + * Static variable, serialized by the graph_lock(). + * + * We use this static variable to save the stack trace in case + * we call into this function multiple times due to encountering + * trylocks in the held lock stack. 
+ */ + static struct stack_trace trace; /* * Prove that the new <prev> -> <next> dependency would not @@ -1698,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } + if (!trylock_loop && !save_trace(&trace)) + return 0; + /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance); + next->acquire_ip, distance, &trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance); + next->acquire_ip, distance, &trace); if (!ret) return 0; @@ -1741,6 +1728,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; + int trylock_loop = 0; struct held_lock *hlock; /* @@ -1766,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, distance)) + if (!check_prev_add(curr, hlock, next, + distance, trylock_loop)) return 0; /* * Stop after the first non-trylock entry, @@ -1789,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; + trylock_loop = 1; } return 1; out_bug: @@ -1835,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, list_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: - debug_atomic_inc(&chain_lookup_hits); + debug_atomic_inc(chain_lookup_hits); if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -1900,7 +1890,7 @@ cache_hit: chain_hlocks[chain->base + j] = class - lock_classes; } list_add_tail_rcu(&chain->entry, hash_head); - debug_atomic_inc(&chain_lookup_misses); + debug_atomic_inc(chain_lookup_misses); inc_chains(); return 1; @@ -2147,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return ret; return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) @@ -2321,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip) return; if (unlikely(curr->hardirqs_enabled)) { - debug_atomic_inc(&redundant_hardirqs_on); + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but loosing one hit + * in a stat is not a big deal. 
+ */ + __debug_atomic_inc(redundant_hardirqs_on); return; } /* we'll do an OFF -> ON transition: */ @@ -2348,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip) curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_on_events); + debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL(trace_hardirqs_on_caller); @@ -2380,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip) curr->hardirqs_enabled = 0; curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_off_events); + debug_atomic_inc(hardirqs_off_events); } else - debug_atomic_inc(&redundant_hardirqs_off); + debug_atomic_inc(redundant_hardirqs_off); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -2406,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip) return; if (curr->softirqs_enabled) { - debug_atomic_inc(&redundant_softirqs_on); + debug_atomic_inc(redundant_softirqs_on); return; } @@ -2416,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip) curr->softirqs_enabled = 1; curr->softirq_enable_ip = ip; curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_on_events); + debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are @@ -2446,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip) curr->softirqs_enabled = 0; curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_off_events); + debug_atomic_inc(softirqs_off_events); DEBUG_LOCKS_WARN_ON(!softirq_count()); } else - debug_atomic_inc(&redundant_softirqs_off); + debug_atomic_inc(redundant_softirqs_off); } static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) @@ -2654,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - debug_atomic_dec(&nr_unused_locks); + debug_atomic_dec(nr_unused_locks); break; default: if (!debug_locks_off_graph_unlock()) @@ -2716,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lockdep_init_map); +struct lock_class_key __lockdep_no_validate__; + /* * This gets called for every mutex_lock*()/spin_lock*() operation. 
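
The __lockdep_no_validate__ key declared above is meant to be attached to locks whose dependencies should not be fully validated; the check that honours it sits in __lock_acquire() just below. The companion include/linux/lockdep.h change (not shown here) is expected to wrap it roughly like this sketch:

	#define lockdep_set_novalidate_class(lock) \
		lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)

	/* hypothetical user: a lock taken in too many driver-specific orders */
	lockdep_set_novalidate_class(&some_dev->mutex);
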
* We maintain the dependency maps and validate the locking attempt: @@ -2750,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; } + if (lock->key == &__lockdep_no_validate__) + check = 1; + if (!subclass) class = lock->class_cache; /* @@ -2760,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - debug_atomic_inc((atomic_t *)&class->ops); + atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) @@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - if (unlikely(current->lockdep_recursion)) return; @@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, check_flags(flags); current->lockdep_recursion = 1; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0); current->lockdep_recursion = 0; @@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested, { unsigned long flags; - trace_lock_release(lock, nested, ip); - if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3392,7 +3390,7 @@ found_it: hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { @@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_contended(lock, ip); - if (unlikely(!lock_stat)) return; @@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; + trace_lock_contended(lock, ip); __lock_contended(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3809,3 +3806,25 @@ void lockdep_sys_exit(void) lockdep_print_held_locks(curr); } } + +void lockdep_rcu_dereference(const char *file, const int line) +{ + struct task_struct *curr = current; + +#ifndef CONFIG_PROVE_RCU_REPEATEDLY + if (!debug_locks_off()) + return; +#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ + /* Note: the following can be executed concurrently, so be careful. */ + printk("\n===================================================\n"); + printk( "[ INFO: suspicious rcu_dereference_check() usage. 
]\n"); + printk( "---------------------------------------------------\n"); + printk("%s:%d invoked rcu_dereference_check() without protection!\n", + file, line); + printk("\nother info that might help us debug this:\n\n"); + printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); + dump_stack(); +} +EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2ee95ad131..4f560cfedc8 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class) #endif #ifdef CONFIG_DEBUG_LOCKDEP + +#include <asm/local.h> /* - * Various lockdep statistics: + * Various lockdep statistics. + * We want them per cpu as they are often accessed in fast path + * and we want to avoid too much cache bouncing. */ -extern atomic_t chain_lookup_hits; -extern atomic_t chain_lookup_misses; -extern atomic_t hardirqs_on_events; -extern atomic_t hardirqs_off_events; -extern atomic_t redundant_hardirqs_on; -extern atomic_t redundant_hardirqs_off; -extern atomic_t softirqs_on_events; -extern atomic_t softirqs_off_events; -extern atomic_t redundant_softirqs_on; -extern atomic_t redundant_softirqs_off; -extern atomic_t nr_unused_locks; -extern atomic_t nr_cyclic_checks; -extern atomic_t nr_cyclic_check_recursions; -extern atomic_t nr_find_usage_forwards_checks; -extern atomic_t nr_find_usage_forwards_recursions; -extern atomic_t nr_find_usage_backwards_checks; -extern atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) +struct lockdep_stats { + int chain_lookup_hits; + int chain_lookup_misses; + int hardirqs_on_events; + int hardirqs_off_events; + int redundant_hardirqs_on; + int redundant_hardirqs_off; + int softirqs_on_events; + int softirqs_off_events; + int redundant_softirqs_on; + int redundant_softirqs_off; + int nr_unused_locks; + int nr_cyclic_checks; + int nr_cyclic_check_recursions; + int nr_find_usage_forwards_checks; + int nr_find_usage_forwards_recursions; + int nr_find_usage_backwards_checks; + int nr_find_usage_backwards_recursions; +}; + +DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); + +#define __debug_atomic_inc(ptr) \ + this_cpu_inc(lockdep_stats.ptr); + +#define debug_atomic_inc(ptr) { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_inc(lockdep_stats.ptr); \ +} + +#define debug_atomic_dec(ptr) { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_dec(lockdep_stats.ptr); \ +} + +#define debug_atomic_read(ptr) ({ \ + struct lockdep_stats *__cpu_lockdep_stats; \ + unsigned long long __total = 0; \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \ + __total += __cpu_lockdep_stats->ptr; \ + } \ + __total; \ +}) #else +# define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4aba4f3584..59b76c8ce9d 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = { static void lockdep_stats_debug_show(struct seq_file *m) { #ifdef CONFIG_DEBUG_LOCKDEP - unsigned int hi1 = 
debug_atomic_read(&hardirqs_on_events), - hi2 = debug_atomic_read(&hardirqs_off_events), - hr1 = debug_atomic_read(&redundant_hardirqs_on), - hr2 = debug_atomic_read(&redundant_hardirqs_off), - si1 = debug_atomic_read(&softirqs_on_events), - si2 = debug_atomic_read(&softirqs_off_events), - sr1 = debug_atomic_read(&redundant_softirqs_on), - sr2 = debug_atomic_read(&redundant_softirqs_off); - - seq_printf(m, " chain lookup misses: %11u\n", - debug_atomic_read(&chain_lookup_misses)); - seq_printf(m, " chain lookup hits: %11u\n", - debug_atomic_read(&chain_lookup_hits)); - seq_printf(m, " cyclic checks: %11u\n", - debug_atomic_read(&nr_cyclic_checks)); - seq_printf(m, " find-mask forwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask backwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_backwards_checks)); - - seq_printf(m, " hardirq on events: %11u\n", hi1); - seq_printf(m, " hardirq off events: %11u\n", hi2); - seq_printf(m, " redundant hardirq ons: %11u\n", hr1); - seq_printf(m, " redundant hardirq offs: %11u\n", hr2); - seq_printf(m, " softirq on events: %11u\n", si1); - seq_printf(m, " softirq off events: %11u\n", si2); - seq_printf(m, " redundant softirq ons: %11u\n", sr1); - seq_printf(m, " redundant softirq offs: %11u\n", sr2); + unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), + hi2 = debug_atomic_read(hardirqs_off_events), + hr1 = debug_atomic_read(redundant_hardirqs_on), + hr2 = debug_atomic_read(redundant_hardirqs_off), + si1 = debug_atomic_read(softirqs_on_events), + si2 = debug_atomic_read(softirqs_off_events), + sr1 = debug_atomic_read(redundant_softirqs_on), + sr2 = debug_atomic_read(redundant_softirqs_off); + + seq_printf(m, " chain lookup misses: %11llu\n", + debug_atomic_read(chain_lookup_misses)); + seq_printf(m, " chain lookup hits: %11llu\n", + debug_atomic_read(chain_lookup_hits)); + seq_printf(m, " cyclic checks: %11llu\n", + debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " find-mask forwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_forwards_checks)); + seq_printf(m, " find-mask backwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_backwards_checks)); + + seq_printf(m, " hardirq on events: %11llu\n", hi1); + seq_printf(m, " hardirq off events: %11llu\n", hi2); + seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); + seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); + seq_printf(m, " softirq on events: %11llu\n", si1); + seq_printf(m, " softirq off events: %11llu\n", si2); + seq_printf(m, " redundant softirq ons: %11llu\n", sr1); + seq_printf(m, " redundant softirq offs: %11llu\n", sr2); #endif } @@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif } #ifdef CONFIG_DEBUG_LOCKDEP - DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); + DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); diff --git a/kernel/module.c b/kernel/module.c index 5daf0abd63c..ccd64199184 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,6 +1,6 @@ /* Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -59,8 +59,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/module.h> -EXPORT_TRACEPOINT_SYMBOL(module_get); - #if 0 #define DEBUGP printk #else @@ -74,11 +72,19 @@ EXPORT_TRACEPOINT_SYMBOL(module_get); /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + /* Block module loading/unloading? */ int modules_disabled = 0; @@ -88,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -103,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb) } EXPORT_SYMBOL(unregister_module_notifier); +struct load_info { + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long *strmap; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + /* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) @@ -133,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code) EXPORT_SYMBOL(__module_put_and_exit); /* Find a module section: 0 means not found. */ -static unsigned int find_sec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, - const char *name) +static unsigned int find_sec(const struct load_info *info, const char *name) { unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; /* Alloc bit cleared means "ignore it." */ - if ((sechdrs[i].sh_flags & SHF_ALLOC) - && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) return i; + } return 0; } /* Find a module section, or NULL. */ -static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, - const char *secstrings, const char *name) +static void *section_addr(const struct load_info *info, const char *name) { /* Section 0 has sh_addr 0. */ - return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; } /* Find a module section, or NULL. Fill in number of "objects" in section. 
*/ -static void *section_objs(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, +static void *section_objs(const struct load_info *info, const char *name, size_t object_size, unsigned int *num) { - unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + unsigned int sec = find_sec(info, name); /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = sechdrs[sec].sh_size / object_size; - return (void *)sechdrs[sec].sh_addr; + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; } /* Provided by the linker */ @@ -178,8 +195,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; @@ -222,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, unsigned int symnum, void *data), void *data) { struct module *mod; - const struct symsearch arr[] = { + static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, @@ -329,7 +344,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -370,67 +385,112 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { - void *ptr; + return mod->percpu; +} +static int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - ptr = __alloc_reserved_percpu(size, align); - if (!ptr) + mod->percpu = __alloc_reserved_percpu(size, align); + if (!mod->percpu) { printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); - return ptr; + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, size); + return -ENOMEM; + } + mod->percpu_size = size; + return 0; } -static void percpu_modfree(void *freeme) +static void percpu_modfree(struct module *mod) { - free_percpu(freeme); + free_percpu(mod->percpu); } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { - return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); + return find_sec(info, ".data..percpu"); } -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +static void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { int cpu; for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); + memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); +} + +/** + * is_module_percpu_address - test whether address is from 
module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + struct module *mod; + unsigned int cpu; + + preempt_disable(); + + list_for_each_entry_rcu(mod, &modules, list) { + if (!mod->percpu_size) + continue; + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(mod->percpu, cpu); + + if ((void *)addr >= start && + (void *)addr < start + mod->percpu_size) { + preempt_enable(); + return true; + } + } + } + + preempt_enable(); + return false; } #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) +static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline void percpu_modfree(void *pcpuptr) +static inline int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) { - BUG(); + return -ENOMEM; } -static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static inline void percpu_modfree(struct module *mod) +{ +} +static unsigned int find_pcpusec(struct load_info *info) { return 0; } -static inline void percpu_modcopy(void *pcpudst, const void *src, - unsigned long size) +static inline void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) { /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } +bool is_module_percpu_address(unsigned long addr) +{ + return false; +} #endif /* CONFIG_SMP */ @@ -467,34 +527,34 @@ MODINFO_ATTR(srcversion); static char last_unloaded_module[MODULE_NAME_LEN+1]; #ifdef CONFIG_MODULE_UNLOAD + +EXPORT_TRACEPOINT_SYMBOL(module_get); + /* Init the unload section of the module. */ -static void module_unload_init(struct module *mod) +static int module_unload_init(struct module *mod) { - int cpu; + mod->refptr = alloc_percpu(struct module_ref); + if (!mod->refptr) + return -ENOMEM; + + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); - INIT_LIST_HEAD(&mod->modules_which_use_me); - for_each_possible_cpu(cpu) - local_set(__module_ref_addr(mod, cpu), 0); /* Hold reference count during initialization. */ - local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); + __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; -} -/* modules using other modules */ -struct module_use -{ - struct list_head list; - struct module *module_which_uses; -}; + return 0; +} /* Does a already use b? */ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -503,62 +563,70 @@ static int already_uses(struct module *a, struct module *b) return 0; } -/* Module a uses b */ -int use_module(struct module *a, struct module *b) +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). 
+ */ +static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - int no_warn, err; - if (b == NULL || already_uses(a, b)) return 1; + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + return 0; +} + +/* Module a uses b: caller needs module_mutex() */ +int ref_module(struct module *a, struct module *b) +{ + int err; - /* If we're interrupted or time out, we fail. */ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); - return 0; + return err; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) { - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; + struct module_use *use, *tmp; - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + mutex_lock(&module_mutex); + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); } + mutex_unlock(&module_mutex); + + free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -615,12 +683,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced) unsigned int module_refcount(struct module *mod) { - unsigned int total = 0; + unsigned int incs = 0, decs = 0; int cpu; for_each_possible_cpu(cpu) - total += local_read(__module_ref_addr(mod, cpu)); - return total; + decs += per_cpu_ptr(mod->refptr, cpu)->decs; + /* + * ensure the incs are added up after the decs. + * module_put ensures incs are visible before decs with smp_wmb. + * + * This 2-count scheme avoids the situation where the refcount + * for CPU0 is read, then CPU0 increments the module refcount, + * then CPU1 drops that refcount, then the refcount for CPU1 is + * read. We would record a decrement but not its corresponding + * increment so we would see a low count (disaster). + * + * Rare situation? But module_refcount can be preempted, and we + * might be tallying up 4096+ CPUs. So it is not impossible. 
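
The incs/decs pair summed here lives in the per-cpu module_ref structure introduced by the matching include/linux/module.h change; that header is not in this hunk, but the layout this code relies on is roughly the following sketch (the real definition may also carry an alignment attribute):

	struct module_ref {
		unsigned int incs;
		unsigned int decs;
	};

	/* inside struct module, under CONFIG_MODULE_UNLOAD: */
	struct module_ref __percpu *refptr;
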
+ */ + smp_rmb(); + for_each_possible_cpu(cpu) + incs += per_cpu_ptr(mod->refptr, cpu)->incs; + return incs - decs; } EXPORT_SYMBOL(module_refcount); @@ -656,16 +740,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - /* Create stop_machine threads since free_module relies on - * a non-failing stop_machine call. */ - ret = stop_machine_create(); - if (ret) - return ret; - - if (mutex_lock_interruptible(&module_mutex) != 0) { - ret = -EINTR; - goto out_stop; - } + if (mutex_lock_interruptible(&module_mutex) != 0) + return -EINTR; mod = find_module(name); if (!mod) { @@ -673,7 +749,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -717,16 +793,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); -out_stop: - stop_machine_destroy(); return ret; } @@ -739,9 +813,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { @@ -796,14 +870,15 @@ static struct module_attribute refcnt = { void module_put(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_dec(__module_ref_addr(module, cpu)); - trace_module_put(module, _RET_IP_, - local_read(__module_ref_addr(module, cpu))); + preempt_disable(); + smp_wmb(); /* see comment in module_refcount */ + __this_cpu_inc(module->refptr->decs); + + trace_module_put(module, _RET_IP_); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); - put_cpu(); + preempt_enable(); } } EXPORT_SYMBOL(module_put); @@ -819,14 +894,15 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); -static inline void module_unload_init(struct module *mod) +static inline int module_unload_init(struct module *mod) { + return 0; } #endif /* CONFIG_MODULE_UNLOAD */ @@ -940,6 +1016,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -982,35 +1060,68 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. 
if we find one, record usage. - Must be holding module_mutex. */ -static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ +static const struct kernel_symbol *resolve_symbol(struct module *mod, + const struct load_info *info, const char *name, - struct module *mod) + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, + owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: + mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol * +resolve_symbol_wait(struct module *mod, + const struct load_info *info, + const char *name) +{ + const struct kernel_symbol *ksym; + char owner[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) + || PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, owner); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> */ -#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) +#ifdef CONFIG_SYSFS +#ifdef CONFIG_KALLSYMS static inline bool sect_empty(const Elf_Shdr *sect) { return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; @@ -1047,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; @@ -1056,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, struct attribute **gattr; /* Count loaded sections and allocate structures */ - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1074,15 +1184,17 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < nsect; i++) { - if (sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + if (sect_empty(sec)) continue; - sattr->address = sechdrs[i].sh_addr; - sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + sattr->address = sec->sh_addr; + sattr->name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); if (sattr->name == NULL) goto out; sect_attrs->nsections++; + sysfs_attr_init(&sattr->mattr.attr); sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; @@ -1122,7 +1234,7 
@@ struct module_notes_attrs { struct bin_attribute attrs[0]; }; -static ssize_t module_notes_read(struct kobject *kobj, +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { @@ -1145,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; @@ -1158,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i]) && - (sechdrs[i].sh_type == SHT_NOTE)) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + (info->sechdrs[i].sh_type == SHT_NOTE)) ++notes; if (notes == 0) @@ -1174,14 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = &notes_attrs->attrs[0]; - for (loaded = i = 0; i < nsect; ++i) { - if (sect_empty(&sechdrs[i])) + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) continue; - if (sechdrs[i].sh_type == SHT_NOTE) { + if (info->sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; - nattr->size = sechdrs[i].sh_size; - nattr->private = (void *) sechdrs[i].sh_addr; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *) info->sechdrs[i].sh_addr; nattr->read = module_notes_read; ++nattr; } @@ -1212,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod) #else -static inline void add_sect_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_sect_attrs(struct module *mod, + const struct load_info *info) { } @@ -1221,18 +1333,44 @@ static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_notes_attrs(struct module *mod, + const struct load_info *info) { } static inline void remove_notes_attrs(struct module *mod) { } +#endif /* CONFIG_KALLSYMS */ + +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } + mutex_unlock(&module_mutex); #endif +} -#ifdef CONFIG_SYSFS -int module_add_modinfo_attrs(struct module *mod) +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1250,6 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod) if (!attr->test || (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); error = 
sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); ++temp_attr; } @@ -1257,7 +1396,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1273,7 +1412,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1307,12 +1446,17 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1327,6 +1471,10 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1336,24 +1484,44 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } static void mod_sysfs_fini(struct module *mod) { + remove_notes_attrs(mod); + remove_sect_attrs(mod); kobject_put(&mod->mkobj.kobj); } -#else /* CONFIG_SYSFS */ +#else /* !CONFIG_SYSFS */ + +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} static void mod_sysfs_fini(struct module *mod) { } +static void module_remove_modinfo_attrs(struct module *mod) +{ +} + +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ -static void mod_kobject_remove(struct module *mod) +static void mod_sysfs_teardown(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -1369,19 +1537,23 @@ static int __unlink_module(void *_mod) { struct module *mod = _mod; list_del(&mod->list); + module_bug_cleanup(mod); return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_remove(mod); + mutex_unlock(&module_mutex); + mod_sysfs_teardown(mod); + + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1395,12 +1567,8 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); - if (mod->percpu) - percpu_modfree(mod->percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - if (mod->refptr) - percpu_modfree(mod->refptr); -#endif + percpu_modfree(mod); + /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -1430,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. 
*/ static int verify_export_symbols(struct module *mod) { @@ -1464,25 +1634,23 @@ static int verify_export_symbols(struct module *mod) } /* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - unsigned int versindex, - unsigned int pcpuindex, - struct module *mod) -{ - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; unsigned long secbase; - unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + unsigned int i; int ret = 0; const struct kernel_symbol *ksym; - for (i = 1; i < n; i++) { + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + switch (sym[i].st_shndx) { case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); + DEBUGP("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1495,29 +1663,28 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, name, PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == pcpuindex) - secbase = (unsigned long)mod->percpu; + if (sym[i].st_shndx == info->index.pcpu) + secbase = (unsigned long)mod_percpu(mod); else - secbase = sechdrs[sym[i].st_shndx].sh_addr; + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; break; } @@ -1526,6 +1693,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + /* Additional bytes needed by arch in front of individual sections */ unsigned int __weak arch_mod_section_prepend(struct module *mod, unsigned int section) @@ -1550,10 +1746,7 @@ static long get_offset(struct module *mod, unsigned int *size, might -- code, read-only data, read-write data, small data. Tally sizes, and place the offsets into sh_entsize fields: high bit means it belongs in init. 
*/ -static void layout_sections(struct module *mod, - const Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static void layout_sections(struct module *mod, struct load_info *info) { static unsigned long const masks[][2] = { /* NOTE: all executable code must be the first section @@ -1566,21 +1759,22 @@ static void layout_sections(struct module *mod, }; unsigned int m, i; - for (i = 0; i < hdr->e_shnum; i++) - sechdrs[i].sh_entsize = ~0UL; + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; DEBUGP("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(secstrings + s->sh_name, ".init")) + || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", name); } if (m == 0) mod->core_text_size = mod->core_size; @@ -1588,17 +1782,18 @@ static void layout_sections(struct module *mod, DEBUGP("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(secstrings + s->sh_name, ".init")) + || !strstarts(sname, ".init")) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", sname); } if (m == 0) mod->init_text_size = mod->init_size; @@ -1637,33 +1832,28 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(Elf_Shdr *sechdrs, - unsigned int info, - const char *tag) +static char *get_modinfo(struct load_info *info, const char *tag) { char *p; unsigned int taglen = strlen(tag); - unsigned long size = sechdrs[info].sh_size; + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; - for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { + for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } -static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, - unsigned int infoindex) +static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) - attr->setup(mod, - get_modinfo(sechdrs, - infoindex, - attr->attr.name)); + attr->setup(mod, get_modinfo(info, attr->attr.name)); } } @@ -1704,11 +1894,10 @@ static int is_exported(const char *name, unsigned long value, } /* As per nm */ -static char elf_type(const Elf_Sym *sym, - Elf_Shdr *sechdrs, - const char *secstrings, - struct module *mod) +static char elf_type(const Elf_Sym *sym, const struct load_info *info) { + const Elf_Shdr *sechdrs = info->sechdrs; + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) return 'v'; @@ -1738,8 +1927,10 @@ static char elf_type(const Elf_Sym *sym, else return 
'b'; } - if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { return 'n'; + } return '?'; } @@ -1764,127 +1955,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } -static unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static void layout_symtab(struct module *mod, struct load_info *info) { - unsigned long symoffs; - Elf_Shdr *symsect = sechdrs + symindex; - Elf_Shdr *strsect = sechdrs + strindex; + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - const char *strtab; unsigned int i, nsrc, ndst; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, - symindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + symsect->sh_name); + info->index.sym) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); - src = (void *)hdr + symsect->sh_offset; + src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - strtab = (void *)hdr + strsect->sh_offset; for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; - while(!__test_and_set_bit(j, strmap) && strtab[j]) + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) ++j; ++ndst; } /* Append room for core symbols at end of core part. */ - symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, - strindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + strsect->sh_name); + info->index.str) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); /* Append room for core symbols' strings at end of core part. */ - *pstroffs = mod->core_size; - __set_bit(0, strmap); - mod->core_size += bitmap_weight(strmap, strsect->sh_size); - - return symoffs; + info->stroffs = mod->core_size; + __set_bit(0, info->strmap); + mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -static void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - unsigned long *strmap) +static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; const Elf_Sym *src; Elf_Sym *dst; char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; + mod->symtab = (void *)symsec->sh_addr; + mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. 
*/ + mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info - = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + symoffs; + mod->core_symtab = dst = mod->module_core + info->symoffs; src = mod->symtab; *dst = *src; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, sechdrs, shnum)) + if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + dst[ndst].st_name = bitmap_weight(info->strmap, + dst[ndst].st_name); ++ndst; } mod->core_num_syms = ndst; - mod->core_strtab = s = mod->module_core + stroffs; - for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) - if (test_bit(i, strmap)) + mod->core_strtab = s = mod->module_core + info->stroffs; + for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) + if (test_bit(i, info->strmap)) *++s = mod->strtab[i]; } #else -static inline unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static inline void layout_symtab(struct module *mod, struct load_info *info) { - return 0; } -static inline void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - const unsigned long *strmap) +static void add_kallsyms(struct module *mod, struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { + if (!debug) + return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) printk(KERN_ERR "dynamic debug error adding module: %s\n", @@ -1892,77 +2052,70 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. 
*/ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } #ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { unsigned int i; /* only scan the sections containing data */ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - for (i = 1; i < hdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + for (i = 1; i < info->hdr->e_shnum; i++) { + const char *name = info->secstrings + info->sechdrs[i].sh_name; + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) continue; - if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 - && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + if (!strstarts(name, ".data") && !strstarts(name, ".bss")) continue; - kmemleak_scan_area((void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size, GFP_KERNEL); + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); } } #else -static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } #endif -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. */ -static noinline struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +/* Sets info->hdr and info->len. */ +static int copy_and_check(struct load_info *info, + const void __user *umod, unsigned long len, + const char __user *uargs) { + int err; Elf_Ehdr *hdr; - Elf_Shdr *sechdrs; - char *secstrings, *args, *modmagic, *strtab = NULL; - char *staging; - unsigned int i; - unsigned int symindex = 0; - unsigned int strindex = 0; - unsigned int modindex, versindex, infoindex, pcpuindex; - struct module *mod; - long err = 0; - void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long symoffs, stroffs, *strmap; - - mm_segment_t old_fs; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); if (len < sizeof(*hdr)) - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ /* vmalloc barfs on "unusual" numbers. 
Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; @@ -1970,142 +2123,225 @@ static noinline struct module *load_module(void __user *umod, } /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ + weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(*sechdrs)) { + || hdr->e_shentsize != sizeof(Elf_Shdr)) { err = -ENOEXEC; goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) - goto truncated; + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } + + info->hdr = hdr; + info->len = len; + return 0; + +free_hdr: + vfree(hdr); + return err; +} + +static void free_copy(struct load_info *info) +{ + vfree(info->hdr); +} - /* Convenience variables */ - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - sechdrs[0].sh_addr = 0; +static int rewrite_section_headers(struct load_info *info) +{ + unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_NOBITS - && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) - goto truncated; + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (shdr->sh_type != SHT_NOBITS + && info->len < shdr->sh_offset + shdr->sh_size) { + printk(KERN_ERR "Module len %lu truncated\n", + info->len); + return -ENOEXEC; + } /* Mark all sections sh_addr with their address in the temporary image. */ - sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - /* Internal symbols and strings. */ - if (sechdrs[i].sh_type == SHT_SYMTAB) { - symindex = i; - strindex = sechdrs[i].sh_link; - strtab = (char *)hdr + sechdrs[strindex].sh_offset; - } #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; + if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } - modindex = find_sec(hdr, sechdrs, secstrings, - ".gnu.linkonce.this_module"); - if (!modindex) { + /* Track but don't keep modinfo and version sections. */ + info->index.vers = find_sec(info, "__versions"); + info->index.info = find_sec(info, ".modinfo"); + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. + * + * Return the temporary module pointer (we'll replace it with the final + * one when we move the module sections around). + */ +static struct module *setup_load_info(struct load_info *info) +{ + unsigned int i; + int err; + struct module *mod; + + /* Set up the convenience variables */ + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + info->secstrings = (void *)info->hdr + + info->sechdrs[info->hdr->e_shstrndx].sh_offset; + + err = rewrite_section_headers(info); + if (err) + return ERR_PTR(err); + + /* Find internal symbols and strings. 
*/ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { printk(KERN_WARNING "No module found in object\n"); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ - mod = (void *)sechdrs[modindex].sh_addr; + mod = (void *)info->sechdrs[info->index.mod].sh_addr; - if (symindex == 0) { + if (info->index.sym == 0) { printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", mod->name); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } - versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); - infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); - pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); - - /* Don't keep modinfo and version sections. */ - sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(sechdrs, versindex, mod)) { - err = -ENOEXEC; - goto free_hdr; - } + if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + return ERR_PTR(-ENOEXEC); + + return mod; +} + +static int check_modinfo(struct module *mod, struct load_info *info) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; - modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); if (err) - goto free_hdr; - } else if (!same_magic(modmagic, vermagic, versindex)) { + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); - err = -ENOEXEC; - goto free_hdr; + return -ENOEXEC; } - staging = get_modinfo(sechdrs, infoindex, "staging"); - if (staging) { + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); } - /* Now copy in args */ - args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(args)) { - err = PTR_ERR(args); - goto free_hdr; - } + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); - strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) - * sizeof(long), GFP_KERNEL); - if (!strmap) { - err = -ENOMEM; - goto free_mod; - } + return 0; +} - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } +static void find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(info, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - mod->state = 
MODULE_STATE_COMING; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->unused_syms = section_objs(info, "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(info, "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); +#endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); - if (err < 0) - goto free_mod; +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(info, "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif - if (pcpuindex) { - /* We have a special allocation for this section. */ - percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign, - mod->name); - if (!percpu) { - err = -ENOMEM; - goto free_mod; - } - sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - mod->percpu = percpu; - } + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, hdr, sechdrs, secstrings); - symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, - secstrings, &stroffs, strmap); + if (section_addr(info, "__obsparm")) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + info->debug = section_objs(info, "__verbose", + sizeof(*info->debug), &info->num_debug); +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2115,10 +2351,9 @@ static noinline struct module *load_module(void __user *umod, * leak. 
*/ kmemleak_not_leak(ptr); - if (!ptr) { - err = -ENOMEM; - goto free_percpu; - } + if (!ptr) + return -ENOMEM; + memset(ptr, 0, mod->core_size); mod->module_core = ptr; @@ -2131,56 +2366,40 @@ static noinline struct module *load_module(void __user *umod, */ kmemleak_ignore(ptr); if (!ptr && mod->init_size) { - err = -ENOMEM; - goto free_core; + module_free(mod, mod->module_core); + return -ENOMEM; } memset(ptr, 0, mod->init_size); mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ DEBUGP("final section addresses:\n"); - for (i = 0; i < hdr->e_shnum; i++) { + for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (!(shdr->sh_flags & SHF_ALLOC)) continue; - if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) + if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->module_init - + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + sechdrs[i].sh_entsize; + dest = mod->module_core + shdr->sh_entsize; - if (sechdrs[i].sh_type != SHT_NOBITS) - memcpy(dest, (void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size); + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ - sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); + shdr->sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", + shdr->sh_addr, info->secstrings + shdr->sh_name); } - /* Module has been moved. */ - mod = (void *)sechdrs[modindex].sh_addr; - kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); - if (!mod->refptr) { - err = -ENOMEM; - goto free_init; - } -#endif - /* Now we've moved module, initialize linked lists, etc. */ - module_unload_init(mod); - - /* add kobject, so we can reference it. */ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + return 0; +} +static int check_module_license_and_versions(struct module *mod) +{ /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -2193,77 +2412,6 @@ static noinline struct module *load_module(void __user *umod, if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, sechdrs, infoindex); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, - mod); - if (err < 0) - goto cleanup; - - /* Now we've got everything in the final locations, we can - * find optional sections. 
*/ - mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); - mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused_gpl"); -#endif -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); -#endif - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(hdr, sechdrs, secstrings, - "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(hdr, sechdrs, secstrings, - "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, - "__mcount_loc", - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2273,67 +2421,16 @@ static noinline struct module *load_module(void __user *umod, || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { - err = try_to_force_load(mod, - "no versions for exported symbols"); - if (err) - goto cleanup; + return try_to_force_load(mod, + "no versions for exported symbols"); } #endif + return 0; +} - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { - const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; - - /* Not a valid relocation section? 
*/ - if (info >= hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(sechdrs[info].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(sechdrs, strtab, symindex, i,mod); - else if (sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(sechdrs, strtab, symindex, i, - mod); - if (err < 0) - goto cleanup; - } - - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - - /* Set up and sort exception table */ - mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Finally, copy percpu area over. */ - percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, - sechdrs[pcpuindex].sh_size); - - add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, - symoffs, stroffs, secstrings, strmap); - kfree(strmap); - strmap = NULL; - - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - - debug = section_objs(hdr, sechdrs, secstrings, "__verbose", - sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } - - err = module_finalize(hdr, sechdrs, mod); - if (err < 0) - goto cleanup; +static void flush_module_icache(const struct module *mod) +{ + mm_segment_t old_fs; /* flush the icache in correct context */ old_fs = get_fs(); @@ -2352,11 +2449,160 @@ static noinline struct module *load_module(void __user *umod, (unsigned long)mod->module_core + mod->core_size); set_fs(old_fs); +} - mod->args = args; - if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); +static struct module *layout_and_allocate(struct load_info *info) +{ + /* Module within temporary copy. */ + struct module *mod; + Elf_Shdr *pcpusec; + int err; + + mod = setup_load_info(info); + if (IS_ERR(mod)) + return mod; + + err = check_modinfo(mod, info); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, mod); + if (err < 0) + goto out; + + pcpusec = &info->sechdrs[info->index.pcpu]; + if (pcpusec->sh_size) { + /* We have a special allocation for this section. */ + err = percpu_modalloc(mod, + pcpusec->sh_size, pcpusec->sh_addralign); + if (err) + goto out; + pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, info); + + info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) + * sizeof(long), GFP_KERNEL); + if (!info->strmap) { + err = -ENOMEM; + goto free_percpu; + } + layout_symtab(mod, info); + + /* Allocate and move to the final place */ + err = move_module(mod, info); + if (err) + goto free_strmap; + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; + +free_strmap: + kfree(info->strmap); +free_percpu: + percpu_modfree(mod); +out: + return ERR_PTR(err); +} + +/* mod is no longer valid after this! 
*/ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + kfree(info->strmap); + percpu_modfree(mod); + module_free(mod, mod->module_init); + module_free(mod, mod->module_core); +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. */ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct load_info info = { NULL, }; + struct module *mod; + long err; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + /* Copy in the blobs from userspace, check they are vaguely sane. */ + err = copy_and_check(&info, umod, len, uargs); + if (err) + return ERR_PTR(err); + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(&info); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto free_module; + + /* Now we've got everything in the final locations, we can + * find optional sections. */ + find_module_sections(mod, &info); + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, &info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, &info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, &info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, &info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + /* Mark state as coming so strong_try_module_get() ignores us. */ + mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since @@ -2365,59 +2611,67 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + goto unlock; + } + + /* This has to be done once we're sure module name is unique. */ + if (!mod->taints) + dynamic_debug_setup(info.debug, info.num_debug); + + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto ddebug; + + module_bug_finalize(info.hdr, info.sechdrs, mod); list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); + /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); + /* Link in to syfs. 
*/ + err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); if (err < 0) goto unlink; - add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - /* Get rid of temporary copy */ - vfree(hdr); - - trace_module_load(mod); + /* Get rid of temporary copy and strmap. */ + kfree(info.strmap); + free_copy(&info); /* Done! */ + trace_module_load(mod); return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + module_bug_cleanup(mod); + + ddebug: + if (!mod->taints) + dynamic_debug_remove(info.debug); + unlock: + mutex_unlock(&module_mutex); synchronize_sched(); + kfree(mod->args); + free_arch_cleanup: module_arch_cleanup(mod); - cleanup: + free_modinfo: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); - free_init: -#endif - module_free(mod, mod->module_init); - free_core: - module_free(mod, mod->module_core); - /* mod will be freed with core. Don't access it beyond this line! */ - free_percpu: - if (percpu) - percpu_modfree(percpu); - free_mod: - kfree(args); - kfree(strmap); - free_hdr: - vfree(hdr); + free_module: + module_deallocate(mod, &info); + free_copy: + free_copy(&info); return ERR_PTR(err); - - truncated: - printk(KERN_ERR "Module len %lu truncated\n", len); - err = -ENOEXEC; - goto free_hdr; } /* Call module constructors. */ @@ -2442,19 +2696,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2471,9 +2716,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } diff --git a/kernel/mutex.c b/kernel/mutex.c index 632f04c57d8..200407c1502 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -36,15 +36,6 @@ # include <asm/mutex.h> #endif -/*** - * mutex_init - initialize the mutex - * @lock: the mutex to be initialized - * @key: the lock_class_key for the class; used by mutex lock debugging - * - * Initialize the mutex to unlocked state. - * - * It is not allowed to initialize an already locked mutex. - */ void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired * @@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_unlock - release the mutex * @lock: the mutex to be released * @@ -172,6 +163,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct thread_info *owner; /* + * If we own the BKL, then don't spin. 
The owner of + * the mutex might be waiting on us to release the BKL. + */ + if (unlikely(current->lock_depth >= 0)) + break; + + /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ @@ -357,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); -/*** - * mutex_lock_interruptible - acquire the mutex, interruptable +/** + * mutex_lock_interruptible - acquire the mutex, interruptible * @lock: the mutex to be acquired * * Lock the mutex like mutex_lock(), and return 0 if the mutex has @@ -449,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) return prev == 1; } -/*** - * mutex_trylock - try acquire the mutex, without waiting +/** + * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired * * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * it is negated from the down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The diff --git a/kernel/notifier.c b/kernel/notifier.c index acd24e7643e..2488ba7eb56 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; - nb = rcu_dereference(*nl); + nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { - next_nb = rcu_dereference(nb->next); + next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { @@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ - if (rcu_dereference(nh->head)) { + if (rcu_dereference_raw(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 09b4ff9711b..f74e6c00e26 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -13,6 +13,7 @@ * Pavel Emelianov <xemul@openvz.org> */ +#include <linux/slab.h> #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/init_task.h> @@ -24,7 +25,18 @@ static struct kmem_cache *nsproxy_cachep; -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +struct nsproxy init_nsproxy = { + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) + .ipc_ns = &init_ipc_ns, +#endif + .mnt_ns = NULL, + .pid_ns = &init_pid_ns, +#ifdef CONFIG_NET + .net_ns = &init_net, +#endif +}; static inline struct nsproxy *create_nsproxy(void) { diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 00000000000..751019415d2 --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,1135 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by 
the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/err.h> +#include <linux/cpu.h> +#include <linux/padata.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/rcupdate.h> + +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) +#define MAX_OBJ_NUM 1000 + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask.pcpu); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. + */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; + pinst = pd->pinst; + + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/** + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. 
+ */ +int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu) +{ + int target_cpu, err; + struct padata_parallel_queue *queue; + struct parallel_data *pd; + + rcu_read_lock_bh(); + + pd = rcu_dereference(pinst->pd); + + err = -EINVAL; + if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) + goto out; + + if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) + goto out; + + err = -EBUSY; + if ((pinst->flags & PADATA_RESET)) + goto out; + + if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) + goto out; + + err = 0; + atomic_inc(&pd->refcnt); + padata->pd = pd; + padata->cb_cpu = cb_cpu; + + if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) + atomic_set(&pd->seq_nr, -1); + + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + + target_cpu = padata_cpu_hash(padata); + queue = per_cpu_ptr(pd->pqueue, target_cpu); + + spin_lock(&queue->parallel.lock); + list_add_tail(&padata->list, &queue->parallel.list); + spin_unlock(&queue->parallel.lock); + + queue_work_on(target_cpu, pinst->wq, &queue->work); + +out: + rcu_read_unlock_bh(); + + return err; +} +EXPORT_SYMBOL(padata_do_parallel); + +/* + * padata_get_next - Get the next object that needs serialization. + * + * Return values are: + * + * A pointer to the control struct of the next object that needs + * serialization, if present in one of the percpu reorder queues. + * + * NULL, if all percpu reorder queues are empty. + * + * -EINPROGRESS, if the next object that needs serialization will + * be parallel processed by another cpu and is not yet present in + * the cpu's reorder queue. + * + * -ENODATA, if this cpu has to do the parallel processing for + * the next object. + */ +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + num_cpus = cpumask_weight(pd->cpumask.pcpu); + + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; + } + + padata = NULL; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + BUG_ON(next_nr != padata->seq_nr); + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + pd->processed++; + + goto out; + } + + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); + if (queue->cpu_index == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_serial_queue *squeue; + struct padata_instance *pinst = pd->pinst; + + /* + * We need to ensure that only one cpu can work on dequeueing of + * the reorder queue the time. Calculating in which percpu reorder + * queue the next object will arrive takes some time. A spinlock + * would be highly contended. Also it is not clear in which order + * the objects arrive to the reorder queues. 
So a cpu could wait to + * get the lock just to notice that there is nothing to do at the + * moment. Therefore we use a trylock and let the holder of the lock + * care for all the objects enqueued during the holdtime of the lock. + */ + if (!spin_trylock_bh(&pd->lock)) + return; + + while (1) { + padata = padata_get_next(pd); + + /* + * All reorder queues are empty, or the next object that needs + * serialization is parallel processed by another cpu and is + * still on it's way to the cpu's reorder queue, nothing to + * do for now. + */ + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + /* + * This cpu has to do the parallel processing of the next + * object. It's waiting in the cpu's parallelization queue, + * so exit imediately. + */ + if (PTR_ERR(padata) == -ENODATA) { + del_timer(&pd->timer); + spin_unlock_bh(&pd->lock); + return; + } + + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); + + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); + } + + spin_unlock_bh(&pd->lock); + + /* + * The next object that needs serialization might have arrived to + * the reorder queues in the meantime, we will be called again + * from the timer function if noone else cares for it. + */ + if (atomic_read(&pd->reorder_objects) + && !(pinst->flags & PADATA_RESET)) + mod_timer(&pd->timer, jiffies + HZ); + else + del_timer(&pd->timer); + + return; +} + +static void padata_reorder_timer(unsigned long arg) +{ + struct parallel_data *pd = (struct parallel_data *)arg; + + padata_reorder(pd); +} + +static void padata_serial_worker(struct work_struct *serial_work) +{ + struct padata_serial_queue *squeue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; + + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); + spin_unlock(&squeue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/** + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. 
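The two entry points documented here, padata_do_parallel() above and padata_do_serial() just below, are the client-facing half of the interface. As a rough illustration only: a caller embeds struct padata_priv in its own request object, much as crypto/pcrypt does; every my_* identifier in the sketch is hypothetical and the empty stubs stand in for real work.

#include <linux/kernel.h>
#include <linux/padata.h>

struct my_request {
	struct padata_priv padata;	/* embedded padata control structure */
	void *data;
};

/* Placeholders for the caller's real work. */
static void my_transform(void *data) { }
static void my_complete(struct my_request *req) { }

/* Parallel callback: runs on one of the pcpu cpus with BHs disabled. */
static void my_parallel(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	my_transform(req->data);

	/* Hand the object back so it is delivered in submission order. */
	padata_do_serial(padata);
}

/* Serial callback: runs on the cb_cpu chosen at submission time, in order. */
static void my_serial(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	my_complete(req);
}

static int my_submit(struct padata_instance *pinst, struct my_request *req,
		     int cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;

	/*
	 * -EBUSY: the instance is resetting or already tracks MAX_OBJ_NUM
	 * objects; -EINVAL: the instance is stopped/invalid or cb_cpu is
	 * not in the serial cpumask.
	 */
	return padata_do_parallel(pinst, &req->padata, cb_cpu);
}

Because padata_do_serial() feeds the reorder machinery shown in this file, the serial callbacks always see objects in the order in which padata_do_parallel() accepted them.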
+ */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + pqueue = per_cpu_ptr(pd->pqueue, cpu); + + spin_lock(&pqueue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; + + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.cbcpu); + return -ENOMEM; + } + + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} + +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} + +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; + + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} + +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; + + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; + cpu_index++; + + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); + } + + num_cpus = cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; +} + +/* Allocate and initialize the internal cpumask dependend resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; + + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + + padata_init_pqueues(pd); + padata_init_squeues(pd); + setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); +err_free_pd: + kfree(pd); +err: + return NULL; +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); + kfree(pd); +} + +/* Flush all objects out of the padata queues. 
*/ +static void padata_flush_queues(struct parallel_data *pd) +{ + int cpu; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; + + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); + } + + del_timer_sync(&pd->timer); + + if (atomic_read(&pd->reorder_objects)) + padata_reorder(pd); + + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); + } + + BUG_ON(atomic_read(&pd->refcnt) != 0); +} + +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + +/* Replace the internal control stucture with a new one. */ +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; + + pinst->flags |= PADATA_RESET; + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + + padata_flush_queues(pd_old); + padata_free_pd(pd_old); + + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + + pinst->flags &= ~PADATA_RESET; +} + +/** + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. + * + * @pinst: A poineter to padata instance + * @nblock: A pointer to notifier block. + */ +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); + +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nlock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); + + +/* If cpumask contains no active cpu, we mark the instance as invalid. 
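padata_register_cpumask_notifier() and its counterpart above deliver the notification_mask assembled in padata_replace() together with a pointer to the new masks. A hedged sketch of such a callback follows; it assumes the pcpu/cbcpu layout of struct padata_cpumask from the accompanying header, and all my_* names are invented.

#include <linux/notifier.h>
#include <linux/padata.h>

static int my_cpumask_change(struct notifier_block *self,
			     unsigned long val, void *data)
{
	struct padata_cpumask *new_masks = data;	/* assumed layout: pcpu/cbcpu */

	if (val & PADATA_CPU_PARALLEL)
		pr_info("padata: %u parallel cpus usable\n",
			cpumask_weight(new_masks->pcpu));
	if (val & PADATA_CPU_SERIAL)
		pr_info("padata: %u serial cpus usable\n",
			cpumask_weight(new_masks->cbcpu));

	return NOTIFY_OK;
}

static struct notifier_block my_cpumask_nb = {
	.notifier_call = my_cpumask_change,
};

/*
 * Registered once per instance after allocation:
 *	padata_register_cpumask_notifier(pinst, &my_cpumask_nb);
 * and removed before padata_free():
 *	padata_unregister_cpumask_notifier(pinst, &my_cpumask_nb);
 */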
*/ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; + struct parallel_data *pd; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; + } + + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + padata_replace(pinst, pd); + + if (valid) + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the wokers doing serialization. + * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumsak to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; + + mutex_lock(&pinst->lock); + get_online_cpus(); + + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to parallel and serial cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; + + mutex_lock(&pinst->lock); + get_online_cpus(); + + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; + serial_mask = cpumask; + break; + default: + goto out; + } + + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); + +out: + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); + } + + return 0; +} + + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata cpumasks. + * + * @pinst: padata instance + * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. 
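padata_set_cpumask() and padata_set_cpumasks() above replace the masks by building a fresh parallel_data and handing it to padata_replace(), so in-flight work drains against the old masks first. A sketch of a runtime update follows; only the helper name is invented, and because padata copies the mask the local copy can be freed immediately.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/padata.h>

static int my_restrict_parallel_cpus(struct padata_instance *pinst,
				     const struct cpumask *wanted)
{
	cpumask_var_t mask;
	int err;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(mask, wanted);

	/*
	 * padata intersects the mask with cpu_active_mask itself; if
	 * nothing usable remains, the instance is flagged PADATA_INVALID
	 * and stopped until a workable mask is installed.
	 */
	err = padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, mask);

	free_cpumask_var(mask);
	return err;
}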
+ * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask + */ + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) +{ + int err; + + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + + mutex_lock(&pinst->lock); + + get_online_cpus(); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + + err = __padata_add_cpu(pinst, cpu); + put_online_cpus(); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd = NULL; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata cpumasks. + * + * @pinst: padata instance + * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) +{ + int err; + + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + + mutex_lock(&pinst->lock); + + get_online_cpus(); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + + err = __padata_remove_cpu(pinst, cpu); + put_online_cpus(); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); +} + + +static int padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err; + struct padata_instance *pinst; + int cpu = (unsigned long)hcpu; + + pinst = container_of(nfb, struct padata_instance, cpu_notifier); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + err = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return notifier_from_errno(err); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + err = 
__padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return notifier_from_errno(err); + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + } + + return NOTIFY_OK; +} +#endif + +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? 
+ PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); + +/* + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + ¶llel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->store(pinst, attr, buf, count); + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + +/** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. 
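padata_alloc_possible() above and padata_alloc() documented next are the constructors; together with padata_start(), padata_stop() and padata_free() they form the life cycle of an instance. A hedged sketch of a module driving that life cycle, with every my_* identifier invented:

#include <linux/module.h>
#include <linux/padata.h>
#include <linux/workqueue.h>

static struct padata_instance *my_pinst;
static struct workqueue_struct *my_wq;

static int __init my_init(void)
{
	int err;

	my_wq = create_workqueue("my_padata_wq");
	if (!my_wq)
		return -ENOMEM;

	/* Start with all possible cpus in both the pcpu and cbcpu masks. */
	my_pinst = padata_alloc_possible(my_wq);
	if (!my_pinst) {
		destroy_workqueue(my_wq);
		return -ENOMEM;
	}

	err = padata_start(my_pinst);	/* -EINVAL if the masks are unusable */
	if (err) {
		padata_free(my_pinst);
		destroy_workqueue(my_wq);
	}
	return err;
}

static void __exit my_exit(void)
{
	padata_stop(my_pinst);
	padata_free(my_pinst);
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Once the caller adds the embedded kobject to sysfs, the serial_cpumask and parallel_cpumask attributes defined above let the same masks be changed from user space.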
+ * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct padata_instance *pinst; + struct parallel_data *pd = NULL; + + pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); + if (!pinst) + goto err; + + get_online_cpus(); + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); + goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; + + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; + + rcu_assign_pointer(pinst->pd, pd); + + pinst->wq = wq; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + pinst->flags = 0; + +#ifdef CONFIG_HOTPLUG_CPU + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + register_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + put_online_cpus(); + + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); + mutex_init(&pinst->lock); + + return pinst; + +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); +err_free_inst: + kfree(pinst); + put_online_cpus(); +err: + return NULL; +} +EXPORT_SYMBOL(padata_alloc); + +/** + * padata_free - free a padata instance + * + * @padata_inst: padata instance to free + */ +void padata_free(struct padata_instance *pinst) +{ + kobject_put(&pinst->kobj); +} +EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c index c787333282b..4c13b1a88eb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,9 @@ #include <linux/nmi.h> #include <linux/dmi.h> +#define PANIC_TIMER_STEP 100 +#define PANIC_BLINK_SPD 18 + int panic_on_oops; static unsigned long tainted_mask; static int pause_on_oops; @@ -36,13 +39,13 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -static long no_blink(long time) +static long no_blink(int state) { return 0; } /* Returns how long it waited in ms */ -long (*panic_blink)(long time); +long (*panic_blink)(int state); EXPORT_SYMBOL(panic_blink); /** @@ -57,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...) { static char buf[1024]; va_list args; - long i; + long i, i_next = 0; + int state = 0; /* * It's possible to come here directly from a panic-assertion and @@ -66,6 +70,7 @@ NORET_TYPE void panic(const char * fmt, ...) */ preempt_disable(); + console_verbose(); bust_spinlocks(1); va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); @@ -105,11 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...) */ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); - for (i = 0; i < panic_timeout*1000; ) { + for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); } /* * This will not be a clean reboot, with everything @@ -135,11 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...) 
} #endif local_irq_enable(); - for (i = 0; ; ) { + for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); } } @@ -164,6 +173,7 @@ static const struct tnt tnts[] = { { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, { TAINT_WARN, 'W', ' ' }, { TAINT_CRAP, 'C', ' ' }, + { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, }; /** @@ -180,6 +190,7 @@ static const struct tnt tnts[] = { * 'A' - ACPI table overridden. * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. + * 'I' - Working around severe firmware bug. * * The string is overwritten by the next call to print_tainted(). */ @@ -327,7 +338,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -static void print_oops_end_marker(void) +void print_oops_end_marker(void) { init_oops_id(); printk(KERN_WARNING "---[ end trace %016llx ]---\n", @@ -351,7 +362,8 @@ struct slowpath_args { va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +static void warn_slowpath_common(const char *file, int line, void *caller, + unsigned taint, struct slowpath_args *args) { const char *board; @@ -367,7 +379,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(TAINT_WARN); + add_taint(taint); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) @@ -376,14 +388,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), &args); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); +void warn_slowpath_fmt_taint(const char *file, int line, + unsigned taint, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), + taint, &args); + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt_taint); + void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), NULL); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/kernel/params.c b/kernel/params.c index cf1b6918312..08107d18175 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,7 +24,6 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/ctype.h> -#include <linux/string.h> #if 0 #define DEBUGP printk @@ -32,6 +31,42 @@ #define DEBUGP(fmt, a...) #endif +/* Protects all parameters, and incidentally kmalloced_param list. */ +static DEFINE_MUTEX(param_lock); + +/* This just allows us to keep track of which parameters are kmalloced. */ +struct kmalloced_param { + struct list_head list; + char val[]; +}; +static LIST_HEAD(kmalloced_params); + +static void *kmalloc_parameter(unsigned int size) +{ + struct kmalloced_param *p; + + p = kmalloc(sizeof(*p) + size, GFP_KERNEL); + if (!p) + return NULL; + + list_add(&p->list, &kmalloced_params); + return p->val; +} + +/* Does nothing if parameter wasn't kmalloced above. 
*/ +static void maybe_kfree_parameter(void *param) +{ + struct kmalloced_param *p; + + list_for_each_entry(p, &kmalloced_params, list) { + if (p->val == param) { + list_del(&p->list); + kfree(p); + break; + } + } +} + static inline char dash2underscore(char c) { if (c == '-') @@ -50,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname) static int parse_one(char *param, char *val, - struct kernel_param *params, + const struct kernel_param *params, unsigned num_params, int (*handle_unknown)(char *param, char *val)) { unsigned int i; + int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { + /* Noone handled NULL, so do it here. */ + if (!val && params[i].ops->set != param_set_bool) + return -EINVAL; DEBUGP("They are equal! Calling %p\n", - params[i].set); - return params[i].set(val, ¶ms[i]); + params[i].ops->set); + mutex_lock(¶m_lock); + err = params[i].ops->set(val, ¶ms[i]); + mutex_unlock(¶m_lock); + return err; } } @@ -129,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val) /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ int parse_args(const char *name, char *args, - struct kernel_param *params, + const struct kernel_param *params, unsigned num, int (*unknown)(char *param, char *val)) { @@ -177,22 +219,29 @@ int parse_args(const char *name, /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ - int param_set_##name(const char *val, struct kernel_param *kp) \ + int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ tmptype l; \ int ret; \ \ - if (!val) return -EINVAL; \ ret = strtolfn(val, 0, &l); \ if (ret == -EINVAL || ((type)l != l)) \ return -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ } \ - int param_get_##name(char *buffer, struct kernel_param *kp) \ + int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return sprintf(buffer, format, *((type *)kp->arg)); \ - } + } \ + struct kernel_param_ops param_ops_##name = { \ + .set = param_set_##name, \ + .get = param_get_##name, \ + }; \ + EXPORT_SYMBOL(param_set_##name); \ + EXPORT_SYMBOL(param_get_##name); \ + EXPORT_SYMBOL(param_ops_##name) + STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); @@ -202,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); -int param_set_charp(const char *val, struct kernel_param *kp) +int param_set_charp(const char *val, const struct kernel_param *kp) { - if (!val) { - printk(KERN_ERR "%s: string parameter expected\n", - kp->name); - return -EINVAL; - } - if (strlen(val) > 1024) { printk(KERN_ERR "%s: string parameter too long\n", kp->name); return -ENOSPC; } - /* This is a hack. We can't need to strdup in early boot, and we + maybe_kfree_parameter(*(char **)kp->arg); + + /* This is a hack. We can't kmalloc in early boot, and we * don't need to; this mangled commandline is preserved. 
*/ if (slab_is_available()) { - *(char **)kp->arg = kstrdup(val, GFP_KERNEL); + *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); if (!*(char **)kp->arg) return -ENOMEM; + strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } +EXPORT_SYMBOL(param_set_charp); -int param_get_charp(char *buffer, struct kernel_param *kp) +int param_get_charp(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%s", *((char **)kp->arg)); } +EXPORT_SYMBOL(param_get_charp); + +static void param_free_charp(void *arg) +{ + maybe_kfree_parameter(*((char **)arg)); +} + +struct kernel_param_ops param_ops_charp = { + .set = param_set_charp, + .get = param_get_charp, + .free = param_free_charp, +}; +EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ -int param_set_bool(const char *val, struct kernel_param *kp) +int param_set_bool(const char *val, const struct kernel_param *kp) { bool v; @@ -259,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp) *(int *)kp->arg = v; return 0; } +EXPORT_SYMBOL(param_set_bool); -int param_get_bool(char *buffer, struct kernel_param *kp) +int param_get_bool(char *buffer, const struct kernel_param *kp) { bool val; if (kp->flags & KPARAM_ISBOOL) @@ -271,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp) /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c", val ? 'Y' : 'N'); } +EXPORT_SYMBOL(param_get_bool); + +struct kernel_param_ops param_ops_bool = { + .set = param_set_bool, + .get = param_get_bool, +}; +EXPORT_SYMBOL(param_ops_bool); /* This one must be bool. */ -int param_set_invbool(const char *val, struct kernel_param *kp) +int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; @@ -286,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp) *(bool *)kp->arg = !boolval; return ret; } +EXPORT_SYMBOL(param_set_invbool); -int param_get_invbool(char *buffer, struct kernel_param *kp) +int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); } +EXPORT_SYMBOL(param_get_invbool); + +struct kernel_param_ops param_ops_invbool = { + .set = param_set_invbool, + .get = param_get_invbool, +}; +EXPORT_SYMBOL(param_ops_invbool); /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, - int (*set)(const char *, struct kernel_param *kp), + int (*set)(const char *, const struct kernel_param *kp), u16 flags, unsigned int *num) { @@ -310,12 +386,6 @@ static int param_array(const char *name, kp.arg = elem; kp.flags = flags; - /* No equals sign? */ - if (!val) { - printk(KERN_ERR "%s: expects arguments\n", name); - return -EINVAL; - } - *num = 0; /* We expect a comma-separated list of values. 
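STANDARD_PARAM_DEF and the charp/bool/invbool conversions above illustrate the new shape of a parameter type: a struct kernel_param_ops bundling set/get (and optionally free), always invoked under param_lock. A module that wants bespoke validation can supply its own ops. The sketch below assumes the module_param_cb() wrapper that accompanies this rework in moduleparam.h; the threshold parameter and its bound are invented.

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static int threshold = 10;

static int threshold_set(const char *val, const struct kernel_param *kp)
{
	unsigned long v;
	int err = strict_strtoul(val, 0, &v);

	if (err)
		return err;
	if (v > 100)			/* reject out-of-range writes */
		return -EINVAL;

	*(int *)kp->arg = v;
	return 0;
}

static int threshold_get(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d", *(int *)kp->arg);
}

static struct kernel_param_ops threshold_ops = {
	.set = threshold_set,
	.get = threshold_get,
};

module_param_cb(threshold, &threshold_ops, &threshold, 0644);

Because parse_one() and the sysfs handlers take param_lock around every ->set() and ->get() call, the handlers themselves need no additional locking against concurrent writes through /sys/module/.../parameters/.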
*/ do { @@ -331,6 +401,7 @@ static int param_array(const char *name, /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; + BUG_ON(!mutex_is_locked(¶m_lock)); ret = set(val, &kp); if (ret != 0) @@ -348,17 +419,17 @@ static int param_array(const char *name, return 0; } -int param_array_set(const char *val, struct kernel_param *kp) +static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, kp->flags, + arr->elemsize, arr->ops->set, kp->flags, arr->num ?: &temp_num); } -int param_array_get(char *buffer, struct kernel_param *kp) +static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; @@ -369,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp) if (i) buffer[off++] = ','; p.arg = arr->elem + arr->elemsize * i; - ret = arr->get(buffer + off, &p); + BUG_ON(!mutex_is_locked(¶m_lock)); + ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; @@ -378,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp) return off; } -int param_set_copystring(const char *val, struct kernel_param *kp) +static void param_array_free(void *arg) +{ + unsigned int i; + const struct kparam_array *arr = arg; + + if (arr->ops->free) + for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) + arr->ops->free(arr->elem + arr->elemsize * i); +} + +struct kernel_param_ops param_array_ops = { + .set = param_array_set, + .get = param_array_get, + .free = param_array_free, +}; +EXPORT_SYMBOL(param_array_ops); + +int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - if (!val) { - printk(KERN_ERR "%s: missing param set value\n", kp->name); - return -EINVAL; - } if (strlen(val)+1 > kps->maxlen) { printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); @@ -394,23 +479,31 @@ int param_set_copystring(const char *val, struct kernel_param *kp) strcpy(kps->string, val); return 0; } +EXPORT_SYMBOL(param_set_copystring); -int param_get_string(char *buffer, struct kernel_param *kp) +int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return strlcpy(buffer, kps->string, kps->maxlen); } +EXPORT_SYMBOL(param_get_string); + +struct kernel_param_ops param_ops_string = { + .set = param_set_copystring, + .get = param_get_string, +}; +EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); +#define to_module_attr(n) container_of(n, struct module_attribute, attr) +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) extern struct kernel_param __start___param[], __stop___param[]; struct param_attribute { struct module_attribute mattr; - struct kernel_param *param; + const struct kernel_param *param; }; struct module_param_attrs @@ -421,7 +514,7 @@ struct module_param_attrs }; #ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr); +#define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module *mod, char *buf) @@ -429,10 +522,12 @@ static ssize_t 
param_attr_show(struct module_attribute *mattr, int count; struct param_attribute *attribute = to_param_attr(mattr); - if (!attribute->param->get) + if (!attribute->param->ops->get) return -EPERM; - count = attribute->param->get(buf, attribute->param); + mutex_lock(¶m_lock); + count = attribute->param->ops->get(buf, attribute->param); + mutex_unlock(¶m_lock); if (count > 0) { strcat(buf, "\n"); ++count; @@ -448,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr, int err; struct param_attribute *attribute = to_param_attr(mattr); - if (!attribute->param->set) + if (!attribute->param->ops->set) return -EPERM; - err = attribute->param->set(buf, attribute->param); + mutex_lock(¶m_lock); + err = attribute->param->ops->set(buf, attribute->param); + mutex_unlock(¶m_lock); if (!err) return len; return err; @@ -465,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #endif #ifdef CONFIG_SYSFS +void __kernel_param_lock(void) +{ + mutex_lock(¶m_lock); +} +EXPORT_SYMBOL(__kernel_param_lock); + +void __kernel_param_unlock(void) +{ + mutex_unlock(¶m_lock); +} +EXPORT_SYMBOL(__kernel_param_unlock); + /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject @@ -476,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, * if there's an error. */ static __modinit int add_sysfs_param(struct module_kobject *mk, - struct kernel_param *kp, + const struct kernel_param *kp, const char *name) { struct module_param_attrs *new; @@ -517,6 +626,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, new->grp.attrs = attrs; /* Tack new one on the end. */ + sysfs_attr_init(&new->attrs[num].mattr.attr); new->attrs[num].param = kp; new->attrs[num].mattr.show = param_attr_show; new->attrs[num].mattr.store = param_attr_store; @@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk) * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, - struct kernel_param *kparam, + const struct kernel_param *kparam, unsigned int num_params) { int i, err; @@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - /* FIXME: This should free kmalloced charp parameters. It doesn't. 
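param_set_charp() now installs a kmalloced copy and frees the previous string, so a charp parameter that is writable through sysfs can be freed underneath a reader; the show/store handlers above therefore take param_lock, and __kernel_param_lock()/__kernel_param_unlock() are exported (under CONFIG_SYSFS) so modules can do the same around their own accesses. A small sketch, with the devname parameter invented:

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static char *devname = "eth0";
module_param(devname, charp, 0644);

static void my_report_devname(void)
{
	/*
	 * Hold the parameter mutex while dereferencing the string so a
	 * concurrent sysfs write cannot free it; copy it out if it is
	 * needed after the unlock.
	 */
	__kernel_param_lock();
	pr_info("configured device: %s\n", devname);
	__kernel_param_unlock();
}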
*/ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); } static void __init kernel_add_sysfs_param(const char *name, @@ -723,7 +837,7 @@ static ssize_t module_attr_store(struct kobject *kobj, return ret; } -static struct sysfs_ops module_sysfs_ops = { +static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; @@ -737,7 +851,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops module_uevent_ops = { +static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; @@ -768,28 +882,3 @@ static int __init param_sysfs_init(void) subsys_initcall(param_sysfs_init); #endif /* CONFIG_SYSFS */ - -EXPORT_SYMBOL(param_set_byte); -EXPORT_SYMBOL(param_get_byte); -EXPORT_SYMBOL(param_set_short); -EXPORT_SYMBOL(param_get_short); -EXPORT_SYMBOL(param_set_ushort); -EXPORT_SYMBOL(param_get_ushort); -EXPORT_SYMBOL(param_set_int); -EXPORT_SYMBOL(param_get_int); -EXPORT_SYMBOL(param_set_uint); -EXPORT_SYMBOL(param_get_uint); -EXPORT_SYMBOL(param_set_long); -EXPORT_SYMBOL(param_get_long); -EXPORT_SYMBOL(param_set_ulong); -EXPORT_SYMBOL(param_get_ulong); -EXPORT_SYMBOL(param_set_charp); -EXPORT_SYMBOL(param_get_charp); -EXPORT_SYMBOL(param_set_bool); -EXPORT_SYMBOL(param_get_bool); -EXPORT_SYMBOL(param_set_invbool); -EXPORT_SYMBOL(param_get_invbool); -EXPORT_SYMBOL(param_array_set); -EXPORT_SYMBOL(param_array_get); -EXPORT_SYMBOL(param_set_copystring); -EXPORT_SYMBOL(param_get_string); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d27746bd3a0..b98bed3d818 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -15,6 +15,8 @@ #include <linux/smp.h> #include <linux/file.h> #include <linux/poll.h> +#include <linux/slab.h> +#include <linux/hash.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -56,21 +58,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* @@ -96,40 +83,19 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -void __weak hw_perf_event_setup(int cpu) { barrier(); } -void __weak hw_perf_event_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); -void __perf_disable(void) -{ - __get_cpu_var(perf_disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(perf_disable_count); -} - void perf_disable(void) { - __perf_disable(); - hw_perf_disable(); + if (!__get_cpu_var(perf_disable_count)++) + hw_perf_disable(); } void perf_enable(void) { - if (__perf_enable()) + if (!--__get_cpu_var(perf_disable_count)) hw_perf_enable(); } @@ -248,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return 
cpu_clock(smp_processor_id()); + return local_clock(); } /* @@ -290,24 +256,49 @@ static void update_event_times(struct perf_event *event) } /* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + +/* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *group_leader = event->group_leader; + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; /* - * Depending on whether it is a standalone or sibling event, - * add it straight to the context's event list, or to the group - * leader's sibling list: + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. */ - if (group_leader == event) - list_add_tail(&event->group_entry, &ctx->group_list); - else { - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; + if (event->group_leader == event) { + struct list_head *list; + + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); } list_add_rcu(&event->event_entry, &ctx->event_list); @@ -316,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader; + + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; +} + /* * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -323,21 +332,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - - if (list_empty(&event->group_entry)) + /* + * We can have double detach due to exit/hot-unplug + close. 
+ */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&event->group_entry); list_del_rcu(&event->event_entry); - if (event->group_leader != event) - event->group_leader->nr_siblings--; + if (event->group_leader == event) + list_del_init(&event->group_entry); - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -348,24 +360,73 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void perf_group_detach(struct perf_event *event) +{ + struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + return; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them - * to the context list directly: + * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - - list_move_tail(&sibling->group_entry, &ctx->group_list); + if (list) + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; } } +static inline int +event_filter_match(struct perf_event *event) +{ + return event->cpu == -1 || event->cpu == smp_processor_id(); +} + static void event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = ctx->time - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = ctx->time; + } + if (event->state != PERF_EVENT_STATE_ACTIVE) return; @@ -391,9 +452,7 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - - if (group_event->state != PERF_EVENT_STATE_ACTIVE) - return; + int state = group_event->state; event_sched_out(group_event, cpuctx, ctx); @@ -403,7 +462,7 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); - if (group_event->attr.exclusive) + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -508,18 +567,6 @@ retry: } /* - * Update total_time_enabled and total_time_running for all events in a group. 
- */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -/* * Cross CPU call to disable a performance event */ static void __perf_event_disable(void *info) @@ -608,14 +655,13 @@ void perf_event_disable(struct perf_event *event) static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { if (event->state <= PERF_EVENT_STATE_OFF) return 0; event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + event->oncpu = smp_processor_id(); /* * The new state must be visible before we turn it on in the hardware: */ @@ -642,33 +688,40 @@ event_sched_in(struct perf_event *event, static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; - int ret; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; - if (event_sched_in(group_event, cpuctx, ctx, cpu)) + if (txn) + pmu->start_txn(pmu); + + if (event_sched_in(group_event, cpuctx, ctx)) { + if (txn) + pmu->cancel_txn(pmu); return -EAGAIN; + } /* * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx, cpu)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - return 0; + if (!txn || !pmu->commit_txn(pmu)) + return 0; group_error: /* @@ -682,25 +735,10 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software events, - * 0 if the group contains any hardware events. - */ -static int is_software_only_group(struct perf_event *leader) -{ - struct perf_event *event; - - if (!is_software_event(leader)) - return 0; - - list_for_each_entry(event, &leader->sibling_list, group_entry) - if (!is_software_event(event)) - return 0; + if (txn) + pmu->cancel_txn(pmu); - return 1; + return -EAGAIN; } /* @@ -713,7 +751,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. 
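group_sched_in() above now drives an optional transaction interface on the pmu: start_txn() opens a group transaction, the siblings are added speculatively, commit_txn() performs a single schedulability check for the whole group (returning 0 on success), and cancel_txn() rolls the attempt back. A very rough sketch of how a hardware backend might wire these hooks; only the three signatures come from the code above, the rest is invented, and a real driver keeps this state per cpu.

#include <linux/perf_event.h>

static int my_group_txn;	/* per-cpu in a real PMU backend */

/* Hypothetical constraint check covering all events added so far. */
static int my_check_hw_constraints(void)
{
	return 0;	/* 0: the group fits on the hardware */
}

static void my_pmu_start_txn(const struct pmu *pmu)
{
	/* Defer the expensive constraint check until commit_txn(). */
	my_group_txn = 1;
}

static void my_pmu_cancel_txn(const struct pmu *pmu)
{
	my_group_txn = 0;
}

static int my_pmu_commit_txn(const struct pmu *pmu)
{
	my_group_txn = 0;
	return my_check_hw_constraints();
}

/*
 * Hooked up in the backend's struct pmu:
 *	.start_txn  = my_pmu_start_txn,
 *	.commit_txn = my_pmu_commit_txn,
 *	.cancel_txn = my_pmu_cancel_txn,
 */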
*/ - if (is_software_only_group(event)) + if (event->group_flags & PERF_GROUP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -738,6 +776,7 @@ static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); + perf_group_attach(event); event->tstamp_enabled = ctx->time; event->tstamp_running = ctx->time; event->tstamp_stopped = ctx->time; @@ -754,7 +793,6 @@ static void __perf_install_in_context(void *info) struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - int cpu = smp_processor_id(); int err; /* @@ -801,7 +839,7 @@ static void __perf_install_in_context(void *info) if (!group_can_go_on(event, cpuctx, 1)) err = -EEXIST; else - err = event_sched_in(event, cpuctx, ctx, cpu); + err = event_sched_in(event, cpuctx, ctx); if (err) { /* @@ -943,11 +981,9 @@ static void __perf_event_enable(void *info) } else { perf_disable(); if (event == leader) - err = group_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = group_sched_in(event, cpuctx, ctx); else - err = event_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = event_sched_in(event, cpuctx, ctx); perf_enable(); } @@ -1043,8 +1079,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -void __perf_event_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { struct perf_event *event; @@ -1055,10 +1098,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) + if (!ctx->nr_active) + goto out_enable; + + if (event_type & EVENT_PINNED) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); - } + + if (event_type & EVENT_FLEXIBLE) + list_for_each_entry(event, &ctx->flexible_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + + out_enable: perf_enable(); out: raw_spin_unlock(&ctx->lock); @@ -1115,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1170,17 +1221,15 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. 
*/ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; - struct pt_regs *regs; int do_switch = 1; - regs = task_pt_regs(task); - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1220,15 +1269,13 @@ void perf_event_task_sched_out(struct task_struct *task, rcu_read_unlock(); if (do_switch) { - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } } -/* - * Called with IRQs disabled - */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1238,47 +1285,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx) if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, event_type); cpuctx->task_ctx = NULL; } /* * Called with IRQs disabled */ -static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + task_ctx_sched_out(ctx, EVENT_ALL); +} + +/* + * Called with IRQs disabled + */ +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { - __perf_event_sched_out(&cpuctx->ctx, cpuctx); + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } static void -__perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) { struct perf_event *event; - int can_add_hw = 1; - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_events)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF || - !event->attr.pinned) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); + group_sched_in(event, cpuctx, ctx); /* * If this pinned group hasn't been scheduled, @@ -1289,32 +1330,83 @@ __perf_event_sched_in(struct perf_event_context *ctx, event->state = PERF_EVENT_STATE_ERROR; } } +} - list_for_each_entry(event, &ctx->group_list, group_entry) { - /* - * Ignore events in OFF or ERROR state, and - * ignore pinned events since we did them already. 
- */ - if (event->state <= PERF_EVENT_STATE_OFF || - event->attr.pinned) - continue; +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + int can_add_hw = 1; + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) + continue; /* * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) - if (group_sched_in(event, cpuctx, ctx, cpu)) + if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; } +} + +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + if (event_type & EVENT_PINNED) + ctx_pinned_sched_in(ctx, cpuctx); + + /* Then walk through the lower prio flexible groups */ + if (event_type & EVENT_FLEXIBLE) + ctx_flexible_sched_in(ctx, cpuctx); + perf_enable(); out: raw_spin_unlock(&ctx->lock); } +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + ctx_sched_in(ctx, cpuctx, event_type); +} + +static void task_ctx_sched_in(struct task_struct *task, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + ctx_sched_in(ctx, cpuctx, event_type); + cpuctx->task_ctx = ctx; +} /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -1326,38 +1418,135 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; + if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + + perf_disable(); + + /* + * We want to keep the following priority order: + * cpu pinned (that don't need to move), task pinned, + * cpu flexible, task flexible. 
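ctx_flexible_sched_in() clears can_add_hw after the first group that fails to fit, so groups are honoured in list order and a small later group cannot overtake a large earlier one. Sketch of that rule with an invented counters_needed field, not kernel code:

	#include <stdio.h>

	struct group { const char *name; int counters_needed; };

	/* Offer hardware to flexible groups in list order; the first group that
	 * does not fit closes the PMU for the rest of this pass (can_add_hw = 0),
	 * so later, smaller groups cannot overtake it. */
	static void flexible_sched_in(struct group *g, int nr, int free_counters)
	{
		int can_add_hw = 1;
		int i;

		for (i = 0; i < nr; i++) {
			if (can_add_hw && g[i].counters_needed <= free_counters) {
				free_counters -= g[i].counters_needed;
				printf("%s: on\n", g[i].name);
			} else {
				can_add_hw = 0;
				printf("%s: waits for next rotation\n", g[i].name);
			}
		}
	}

	int main(void)
	{
		struct group g[] = { { "big-group", 4 }, { "small-group", 1 } };

		flexible_sched_in(g, 2, 2);	/* big-group doesn't fit, small-group must wait too */
		return 0;
	}
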
+ */ + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + cpuctx->task_ctx = ctx; + + perf_enable(); } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { - struct perf_event_context *ctx = &cpuctx->ctx; + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. + */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } - __perf_event_sched_in(ctx, cpuctx, cpu); + if (!divisor) + return dividend; + + return div64_u64(dividend, divisor); } -#define MAX_INTERRUPTS (~0ULL) +static void perf_event_stop(struct perf_event *event) +{ + if (!event->pmu->stop) + return event->pmu->disable(event); -static void perf_log_throttle(struct perf_event *event, int enable); + return event->pmu->stop(event); +} + +static int perf_event_start(struct perf_event *event) +{ + if (!event->pmu->start) + return event->pmu->enable(event); + + return event->pmu->start(event); +} -static void perf_adjust_period(struct perf_event *event, u64 events) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; - u64 period, sample_period; + s64 period, sample_period; s64 delta; - events *= hwc->sample_period; - period = div64_u64(events, event->attr.sample_freq); + period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1368,13 +1557,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) sample_period = 1; hwc->sample_period = sample_period; + + if (local64_read(&hwc->period_left) > 8*sample_period) { + perf_disable(); + perf_event_stop(event); + local64_set(&hwc->period_left, 0); + perf_event_start(event); + perf_enable(); + } } static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, freq; + u64 interrupts, now; + s64 delta; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { @@ -1394,45 +1592,23 @@ static void 
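perf_calculate_period() is after period = count * 10^9 / (nsec * sample_freq) but has to stay inside 64-bit arithmetic, hence the REDUCE_FLS() trimming until a u64/u64 division is safe. A userspace sketch of the same target value that leans on the compiler's 128-bit integers (GCC/Clang __int128) instead of the bit trimming, so it shows the goal rather than the kernel's exact steps:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	/* How many events per sample so that, at the observed rate of @count
	 * events in @nsec nanoseconds, roughly @freq samples fire per second? */
	static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
	{
		unsigned __int128 dividend = (unsigned __int128)count * NSEC_PER_SEC;
		unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

		if (!divisor)
			return (uint64_t)dividend;	/* mirrors the !divisor fallback */

		return (uint64_t)(dividend / divisor);
	}

	int main(void)
	{
		/* ~1.2e9 events/s observed, 1000 samples/s wanted -> period ~1.2e6 */
		printf("%llu\n", (unsigned long long)calc_period(1200000, 1000000, 1000));
		return 0;
	}
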
perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); + perf_disable(); event->pmu->unthrottle(event); - interrupts = 2*sysctl_perf_event_sample_rate/HZ; + perf_enable(); } if (!event->attr.freq || !event->attr.sample_freq) continue; - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (event->attr.sample_freq < HZ) { - freq = event->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(event, freq * interrupts); + perf_disable(); + event->pmu->read(event); + now = local64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - event->pmu->disable(event); - atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); - perf_enable(); - } + if (delta > 0) + perf_adjust_period(event, TICK_NSEC, delta); + perf_enable(); } raw_spin_unlock(&ctx->lock); } @@ -1442,51 +1618,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - struct perf_event *event; - - if (!ctx->nr_events) - return; - raw_spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group events too): - */ - perf_disable(); - list_for_each_entry(event, &ctx->group_list, group_entry) { - list_move_tail(&event->group_entry, &ctx->group_list); - break; - } - perf_enable(); + + /* Rotate the first entry last of non-pinned groups */ + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; + int rotate = 0; if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); + if (cpuctx->ctx.nr_events && + cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + ctx = curr->perf_event_ctxp; + if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) + rotate = 1; perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); - perf_event_cpu_sched_out(cpuctx); + if (!rotate) + return; + + perf_disable(); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - perf_event_task_sched_in(curr, cpu); + task_ctx_sched_in(curr, EVENT_FLEXIBLE); + perf_enable(); +} + +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; } /* @@ -1499,6 +1691,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) struct perf_event *event; unsigned long flags; int enabled = 0; + int ret; local_irq_save(flags); ctx = task->perf_event_ctxp; @@ -1509,14 +1702,16 @@ static void 
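perf_event_task_tick() now skips the sched-out/rotate/sched-in cycle entirely unless some context has more events than it can keep active, and rotate_ctx() is just a list_rotate_left() of the flexible groups. A small array-based model of that decision and rotation (not kernel code):

	#include <stdio.h>

	/* rotate left: first entry becomes last, everything else shifts up */
	static void rotate_left(const char **ev, int nr)
	{
		const char *first = ev[0];
		int i;

		for (i = 0; i < nr - 1; i++)
			ev[i] = ev[i + 1];
		ev[nr - 1] = first;
	}

	int main(void)
	{
		const char *flexible[] = { "branches", "cache-misses", "bus-cycles" };
		int nr_events = 3, nr_active = 2;	/* one event is starved */
		int i;

		if (nr_events && nr_events != nr_active)	/* only rotate when it can help */
			rotate_left(flexible, 3);

		for (i = 0; i < 3; i++)
			printf("%s\n", flexible[i]);
		return 0;
	}
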
perf_event_enable_on_exec(struct task_struct *task) raw_spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (!event->attr.enable_on_exec) - continue; - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - continue; - __perf_event_mark_enabled(event, ctx); - enabled = 1; + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; } /* @@ -1527,7 +1722,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task); out: local_irq_restore(flags); } @@ -1559,6 +1754,11 @@ static void __perf_event_read(void *info) event->pmu->read(event); } +static inline u64 perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -1578,7 +1778,7 @@ static u64 perf_event_read(struct perf_event *event) raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return atomic64_read(&event->count); + return perf_event_count(event); } /* @@ -1590,7 +1790,8 @@ __perf_event_init_context(struct perf_event_context *ctx, { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; @@ -1698,6 +1899,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { @@ -1705,7 +1907,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); @@ -1713,9 +1915,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->output) { - fput(event->output->filp); - event->output = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) @@ -1729,9 +1931,30 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_event_remove_from_context(event); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. 
+ */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); @@ -1919,13 +2142,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -1936,7 +2159,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -1979,15 +2202,13 @@ static void perf_event_for_each(struct perf_event *event, static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct perf_event_context *ctx = event->ctx; - unsigned long size; int ret = 0; u64 value; if (!event->attr.sample_period) return -EINVAL; - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) + if (copy_from_user(&value, arg, sizeof(value))) return -EFAULT; if (!value) @@ -2011,7 +2232,27 @@ unlock: return ret; } -static int perf_event_set_output(struct perf_event *event, int output_fd); +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -2038,7 +2279,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return perf_event_period(event, (u64 __user *)arg); case PERF_EVENT_IOC_SET_OUTPUT: - return perf_event_set_output(event, arg); + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); @@ -2099,14 +2356,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2116,9 +2373,9 @@ void 
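With perf_fget_light(), PERF_EVENT_IOC_SET_OUTPUT takes another perf event fd directly (or -1 to detach). Roughly how user space might use it to steer two events into one ring buffer; error handling is omitted and the pairing constraints (same task or same CPU, enforced by perf_event_set_output()) still apply, so treat this as a sketch rather than a reference:

	#include <string.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static int perf_event_open(struct perf_event_attr *attr,
				   pid_t pid, int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		int fd_a, fd_b;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_PAGE_FAULTS;
		attr.sample_period = 1;

		fd_a = perf_event_open(&attr, 0, -1, -1, 0);	/* this one owns the buffer */

		attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
		fd_b = perf_event_open(&attr, 0, -1, -1, 0);

		/* redirect fd_b's samples into fd_a's (future) mmap buffer */
		ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a);

		/* ... mmap(fd_a) and read both sample streams from one ring buffer ... */
		close(fd_b);
		close(fd_a);
		return 0;
	}
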
perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = atomic64_read(&event->count); + userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -2133,9 +2390,23 @@ unlock: rcu_read_unlock(); } -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) { - return data->nr_pages << (PAGE_SHIFT + data->data_order); + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); } #ifndef CONFIG_PERF_USE_VMALLOC @@ -2145,56 +2416,68 @@ static unsigned long perf_data_size(struct perf_mmap_data *data) */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); + + return virt_to_page(buffer->data_pages[pgoff - 1]); +} + +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; - return virt_to_page(data->data_pages[pgoff - 1]); + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - WARN_ON(atomic_read(&event->mmap_count)); - - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->data_order = 0; - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; - return data; + perf_buffer_init(buffer, watermark, flags); + + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2208,14 +2491,19 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - 
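perf_buffer_init() centralizes the wakeup watermark policy that used to live in the mmap path: clamp a requested watermark to the buffer size, default to half the buffer. Restated with plain types, outside the kernel structs:

	#include <stdio.h>

	static long buffer_watermark(long buffer_bytes, long requested)
	{
		long watermark = 0;

		if (requested)
			watermark = requested < buffer_bytes ? requested : buffer_bytes;
		if (!watermark)
			watermark = buffer_bytes / 2;	/* default: wake readers at half full */
		return watermark;
	}

	int main(void)
	{
		printf("%ld\n", buffer_watermark(8 * 4096, 0));		/* -> 16384 */
		printf("%ld\n", buffer_watermark(8 * 4096, 4096));	/* -> 4096 */
		return 0;
	}
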
perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); +} + +static inline int page_order(struct perf_buffer *buffer) +{ + return 0; } #else @@ -2226,13 +2514,18 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ +static inline int page_order(struct perf_buffer *buffer) +{ + return buffer->page_order; +} + static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << data->data_order)) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2242,59 +2535,59 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << data->data_order; + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - WARN_ON(atomic_read(&event->mmap_count)); - - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->data_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; - return data; + perf_buffer_init(buffer, watermark, flags); + + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2302,10 +2595,15 @@ fail: #endif +static unsigned long perf_data_size(struct perf_buffer *buffer) +{ + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); +} + static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & 
FAULT_FLAG_MKWRITE) { @@ -2315,14 +2613,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2337,41 +2635,35 @@ unlock: return ret; } -static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - long max_size = perf_data_size(data); - - atomic_set(&data->lock, -1); - - if (event->attr.watermark) { - data->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!data->watermark) - data->watermark = max_size / 2; + struct perf_buffer *buffer; - - rcu_assign_pointer(event->data, data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; + } + rcu_read_unlock(); - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + return buffer; } -static void perf_mmap_data_release(struct perf_event *event) +static void perf_buffer_put(struct perf_buffer *buffer) { - struct perf_mmap_data *data = event->data; - - WARN_ON(atomic_read(&event->mmap_count)); + if (!atomic_dec_and_test(&buffer->refcount)) + return; - rcu_assign_pointer(event->data, NULL); - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2385,15 +2677,18 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); - struct user_struct *user = current_user(); + unsigned long size = perf_data_size(event->buffer); + struct user_struct *user = event->mmap_user; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_release(event); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); + + perf_buffer_put(buffer); + free_uid(user); } } @@ -2410,11 +2705,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. 
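perf_buffer_get()/perf_buffer_put() make the buffer lifetime explicit: a reader under RCU may only take a reference while the count is still non-zero, and the last put frees it (deferred through call_rcu in the kernel). A userspace model with C11 atomics, RCU elided:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdlib.h>

	struct buffer {
		atomic_int refcount;
		/* ... pages ... */
	};

	/* Succeeds only if someone else still holds a reference; a buffer whose
	 * count already hit zero is on its way to being freed and must not be
	 * resurrected. Mirrors atomic_inc_not_zero(). */
	static bool buffer_try_get(struct buffer *b)
	{
		int old = atomic_load(&b->refcount);

		while (old != 0) {
			if (atomic_compare_exchange_weak(&b->refcount, &old, old + 1))
				return true;
		}
		return false;
	}

	static void buffer_put(struct buffer *b)
	{
		if (atomic_fetch_sub(&b->refcount, 1) == 1)
			free(b);	/* the kernel defers this with call_rcu() */
	}

	int main(void)
	{
		struct buffer *b = calloc(1, sizeof(*b));

		atomic_store(&b->refcount, 1);	/* creator's reference */
		if (buffer_try_get(b))		/* reader grabs a second one */
			buffer_put(b);
		buffer_put(b);			/* last reference dropped -> freed */
		return 0;
	}
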
+ */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -2423,7 +2726,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2437,13 +2740,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&event->mmap_count)) { - if (nr_pages != event->data->nr_pages) + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); + else ret = -EINVAL; goto unlock; } @@ -2462,7 +2762,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = vma->vm_mm->locked_vm + extra; @@ -2472,24 +2772,27 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); - data = perf_mmap_data_alloc(event, nr_pages); - ret = -ENOMEM; - if (!data) - goto unlock; + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; - ret = 0; - perf_mmap_data_init(event, data); + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { + ret = -ENOMEM; + goto unlock; + } + rcu_assign_pointer(event->buffer, buffer); - atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - event->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; unlock: + if (!ret) + atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; @@ -2515,6 +2818,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -2658,18 +2962,40 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } + +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. 
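The mmap() side still expects one metadata page plus a power-of-two number of data pages; what changed is that the buffer is now allocated near the event's CPU and the locked memory is charged to the mmap'ing user, remembered in event->mmap_user. A typical user-space mapping call under those rules, assuming fd is a perf event descriptor (helper name invented):

	#include <stddef.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* Map the control page plus 2^n data pages; anything else is -EINVAL. */
	static void *map_perf_buffer(int fd, unsigned int data_pages_pow2)
	{
		size_t page = (size_t)sysconf(_SC_PAGESIZE);
		size_t len = (1 + (1u << data_pages_pow2)) * page;
		void *base;

		base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (base == MAP_FAILED)
			return NULL;

		/* base points at the perf_event_mmap_page; the data area follows it */
		return base;
	}
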
+ */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2682,7 +3008,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; @@ -2693,128 +3019,88 @@ static void perf_output_wakeup(struct perf_output_handle *handle) } /* - * Curious locking construct. - * * We need to ensure a later event_id doesn't publish a head when a former - * event_id isn't done writing. However since we need to deal with NMIs we + * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * * We only publish the head (and generate a wakeup) when the outer-most - * event_id completes. + * event completes. */ -static void perf_output_lock(struct perf_output_handle *handle) +static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; - int cur, cpu = get_cpu(); - - handle->locked = 0; - - for (;;) { - cur = atomic_cmpxchg(&data->lock, -1, cpu); - if (cur == -1) { - handle->locked = 1; - break; - } - if (cur == cpu) - break; + struct perf_buffer *buffer = handle->buffer; - cpu_relax(); - } + preempt_disable(); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } -static void perf_output_unlock(struct perf_output_handle *handle) +static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; + head = local_read(&buffer->head); /* - * NMI can happen here, which means we can miss a done_head update. + * IRQ/NMI can happen here, which means we can miss a head update. */ - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); + if (!local_dec_and_test(&buffer->nest)) + goto out; /* - * Therefore we have to validate we did not indeed do so. + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the buffer->head read and this + * write. 
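perf_output_space() asks whether writing [offset, head) would trample data user space has not consumed yet (data_tail), with all positions free-running byte counters compared modulo the power-of-two buffer size. The same arithmetic in isolation:

	#include <stdbool.h>
	#include <stdio.h>

	/* @size is a power of two; only enforced when the mapping is writable,
	 * i.e. when user space actually advances data_tail. */
	static bool output_space(unsigned long size, unsigned long tail,
				 unsigned long offset, unsigned long head)
	{
		unsigned long mask = size - 1;

		offset = (offset - tail) & mask;	/* distance already written past tail */
		head   = (head   - tail) & mask;	/* distance we would reach past tail */

		if ((long)(head - offset) < 0)		/* head wrapped onto unread data */
			return false;

		return true;
	}

	int main(void)
	{
		/* 4 KiB buffer, reader at 100, writer at 4000 */
		printf("%d\n", output_space(4096, 100, 4000, 4150));	/* 1: still below tail+size */
		printf("%d\n", output_space(4096, 100, 4000, 4200));	/* 0: would overwrite unread data */
		return 0;
	}
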
*/ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); + buffer->user_page->data_head = head; + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read buffer->head. + */ + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (atomic_xchg(&data->wakeup, 0)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); -out: - put_cpu(); + + out: + preempt_enable(); } -void perf_output_copy(struct perf_output_handle *handle, +__always_inline void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - unsigned int pages_mask; - unsigned long offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - do { - unsigned long page_offset; - unsigned long page_size; - int nr; + unsigned long size = min_t(unsigned long, handle->size, len); - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); - page_offset = offset & (page_size - 1); - size = min_t(unsigned int, page_size - page_offset, len); + memcpy(handle->addr, buf, size); - memcpy(pages[nr] + page_offset, buf, size); + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct perf_buffer *buffer = handle->buffer; - len -= size; - buf += size; - offset += size; + handle->page++; + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); + } } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_event *output_event; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -2830,27 +3116,23 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - output_event = rcu_dereference(event->output); - if (output_event) - event = output_event; - - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) - goto fail; + if (!buffer->nr_pages) + goto out; - have_lost = atomic_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); - perf_output_lock(handle); + perf_output_get_handle(handle); do { /* @@ -2858,26 +3140,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. 
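perf_output_put_handle() replaces the old cross-CPU spin with a per-CPU scheme: writers only nest via IRQ/NMI, so a nest counter is enough, the outermost writer publishes data_head, and a re-check loop catches a head that moved underneath it. A userspace rendering with C11 atomics (the kernel gets away with cheaper local_t ops because only one CPU touches these fields); wakeup handling omitted:

	#include <stdatomic.h>

	struct ring {
		atomic_long head;	/* free-running write position */
		atomic_long nest;	/* writers currently active on this CPU */
		atomic_long user_head;	/* what the consumer may read up to */
	};

	/* Outermost writer publishes the head; if a nested writer bumped head
	 * after we read it, re-enter and publish again so nothing is lost. */
	static void output_put(struct ring *r)
	{
		long head;
	again:
		head = atomic_load(&r->head);

		if (atomic_fetch_sub(&r->nest, 1) != 1)
			return;			/* nested writer: the outer one publishes */

		atomic_store(&r->user_head, head);

		if (head != atomic_load(&r->head)) {
			atomic_fetch_add(&r->nest, 1);	/* missed an update, go again */
			goto again;
		}
	}
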
*/ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = atomic_long_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - handle->offset = offset; - handle->head = head; + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = atomic_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -2885,8 +3171,8 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); + local_inc(&buffer->lost); + perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -2896,19 +3182,19 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } - perf_output_unlock(handle); + perf_output_put_handle(handle); rcu_read_unlock(); } @@ -2941,7 +3227,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[4]; int n = 0; - values[n++] = atomic64_read(&event->count); + values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -2978,7 +3264,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (leader != event) leader->pmu->read(leader); - values[n++] = atomic64_read(&leader->count); + values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); @@ -2990,7 +3276,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (sub != event) sub->pmu->read(sub); - values[n++] = atomic64_read(&sub->count); + values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -3221,7 +3507,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3243,9 +3529,8 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event 
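perf_output_begin() reserves space by advancing buffer->head with local_cmpxchg() after checking against the consumer's data_tail, then turns the returned offset into page/addr/size for perf_output_copy(). The reservation step on its own, simplified to a single writable-buffer space check (names invented):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct ring {
		atomic_ulong head;			/* free-running write position */
		unsigned long size;			/* power of two */
		volatile unsigned long *data_tail;	/* advanced by the consumer */
	};

	/* Claim [offset, offset+len) by advancing head with a compare-and-swap,
	 * retrying if a nested writer got there first. */
	static bool output_reserve(struct ring *r, unsigned long len, unsigned long *offset)
	{
		unsigned long old, new, tail;

		do {
			tail = *r->data_tail;		/* pairs with the consumer's barrier */
			old = atomic_load(&r->head);
			new = old + len;
			if (new - tail > r->size)	/* would overwrite unread data */
				return false;
		} while (!atomic_compare_exchange_weak(&r->head, &old, new));

		*offset = old;
		return true;
	}
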
*task_event) { struct perf_output_handle handle; - int size; struct task_struct *task = task_event->task; - int ret; + int size, ret; size = task_event->event_id.header.size; ret = perf_output_begin(&handle, event, size, 0, 0); @@ -3259,8 +3544,6 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); @@ -3268,13 +3551,14 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3300,7 +3584,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_event_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); put_cpu_var(perf_cpu_context); @@ -3331,6 +3615,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ + .time = perf_clock(), }, }; @@ -3380,7 +3665,7 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3498,27 +3783,30 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event)) + if (perf_event_mmap_match(event, mmap_event, executable)) perf_event_mmap_output(event, mmap_event); } } @@ -3562,6 +3850,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } name = strncpy(tmp, "//anon", sizeof(tmp)); @@ -3578,17 +3874,17 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 
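Anonymous mappings reported in PERF_RECORD_MMAP now get better names: a vma overlapping the brk range is "[heap]", one covering the top of the stack is "[stack]", the rest stays "//anon". The classification in isolation, with the mm_struct fields passed in explicitly:

	#include <stdio.h>

	static const char *anon_vma_name(unsigned long vm_start, unsigned long vm_end,
					 unsigned long start_brk, unsigned long brk,
					 unsigned long start_stack)
	{
		if (vm_start <= start_brk && vm_end >= brk)
			return "[heap]";
		if (vm_start <= start_stack && vm_end >= start_stack)
			return "[stack]";
		return "//anon";
	}

	int main(void)
	{
		printf("%s\n", anon_vma_name(0x602000, 0x702000,	/* vma */
					     0x602000, 0x652000,	/* start_brk, brk */
					     0x7ffffffde000));		/* start_stack */
		return 0;
	}
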
vma->vm_flags & VM_EXEC); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_event_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); } -void __perf_event_mmap(struct vm_area_struct *vma) +void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -3602,14 +3898,14 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, }; @@ -3689,12 +3985,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (event->attr.freq) { u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; + s64 delta = now - hwc->freq_time_stamp; - hwc->freq_stamp = now; + hwc->freq_time_stamp = now; - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); } /* @@ -3750,14 +4046,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -3790,20 +4086,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_unthrottle(struct perf_event *event) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -3814,45 +4103,12 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. 
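__perf_event_overflow() now measures the time between overflows and hands (delta, last_period) to perf_adjust_period(), which nudges sample_period towards the suggested value through the 1/8 low-pass filter visible in the perf_adjust_period() hunk earlier. A standalone illustration of that filter converging:

	#include <stdio.h>
	#include <stdint.h>

	/* One controller step: move 1/8th of the way from the current
	 * sample_period towards the period the last interval suggests. */
	static int64_t adjust_period(int64_t sample_period, int64_t suggested_period)
	{
		int64_t delta = suggested_period - sample_period;

		delta = (delta + 7) / 8;	/* low pass filter */
		sample_period += delta;
		if (sample_period < 1)
			sample_period = 1;
		return sample_period;
	}

	int main(void)
	{
		int64_t period = 100000;
		int i;

		/* overflows keep suggesting ~200000; the period converges gradually */
		for (i = 0; i < 5; i++) {
			period = adjust_period(period, 200000);
			printf("%lld\n", (long long)period);
		}
		return 0;
	}
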
- */ - return 1; -} - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data); - static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { @@ -3873,12 +4129,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -3888,30 +4138,88 @@ static int perf_swevent_match(struct perf_event *event, if (perf_exclude_event(event, regs)) return 0; - if (event->attr.type == PERF_TYPE_TRACEPOINT && - !perf_tp_event_match(event, data)) - return 0; - return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. 
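Software events move from a linear walk of the context's event list to per-CPU hash lists keyed on (type, config). Bucket selection in isolation; the multiplier is only a stand-in for the kernel's hash_64(), and SWEVENT_HLIST_BITS here is an assumed value:

	#include <stdio.h>
	#include <stdint.h>

	#define SWEVENT_HLIST_BITS 8	/* assumption for the sketch: 256 buckets */

	/* Stand-in for hash_64(): multiplicative hash, keep the top bits. */
	static unsigned int hash64_bits(uint64_t val, unsigned int bits)
	{
		return (unsigned int)((val * 0x9E3779B97F4A7C15ULL) >> (64 - bits));
	}

	static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
	{
		uint64_t val = event_id | (type << 32);

		return hash64_bits(val, SWEVENT_HLIST_BITS);
	}

	int main(void)
	{
		/* PERF_TYPE_SOFTWARE = 1, PERF_COUNT_SW_PAGE_FAULTS = 2 */
		printf("bucket %u\n", swevent_bucket(1, 2));
		return 0;
	}
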
+ */ + hlist = rcu_dereference_protected(ctx->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head_rcu(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); int rctx; if (in_nmi()) @@ -3923,10 +4231,8 @@ int perf_swevent_get_recursion_context(void) else rctx = 0; - if (cpuctx->recursion[rctx]) { - put_cpu_var(perf_cpu_context); + if (cpuctx->recursion[rctx]) return -1; - } cpuctx->recursion[rctx]++; barrier(); @@ -3935,35 +4241,11 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void perf_swevent_put_recursion_context(int rctx) +void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; - put_cpu_var(perf_cpu_context); -} -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); - -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. 
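perf_swevent_get_recursion_context() keys its guard on the execution context: NMI, hardirq, softirq and task level each get their own recursion slot, so a software event firing from a deeper context never re-enters one already in flight at the same level (callers now disable preemption themselves instead of relying on get_cpu_var()). Simplified model, with hypothetical predicates standing in for in_nmi()/in_irq()/in_softirq():

	#include <stdbool.h>

	enum { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ, CTX_NMI, CTX_LEVELS };

	struct cpu_ctx {
		int recursion[CTX_LEVELS];
	};

	static int current_context_level(bool nmi, bool hardirq, bool softirq)
	{
		if (nmi)
			return CTX_NMI;
		if (hardirq)
			return CTX_HARDIRQ;
		if (softirq)
			return CTX_SOFTIRQ;
		return CTX_TASK;
	}

	/* Returns the slot to release later, or -1 if this level is already busy. */
	static int get_recursion_context(struct cpu_ctx *c, int level)
	{
		if (c->recursion[level])
			return -1;
		c->recursion[level]++;
		return level;
	}

	static void put_recursion_context(struct cpu_ctx *c, int level)
	{
		c->recursion[level]--;
	}

	int main(void)
	{
		struct cpu_ctx ctx = { { 0 } };
		int rctx = get_recursion_context(&ctx, current_context_level(false, true, false));

		if (rctx >= 0)
			put_recursion_context(&ctx, rctx);
		return 0;
	}
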
- */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -3972,16 +4254,17 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct perf_sample_data data; int rctx; + preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (rctx < 0) return; - data.addr = addr; - data.raw = NULL; + perf_sample_data_init(&data, addr); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) @@ -3991,23 +4274,46 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); +} + +static void perf_swevent_void(struct perf_event *event) +{ +} + +static int perf_swevent_int(struct perf_event *event) +{ + return 0; } static const struct pmu perf_ops_generic = { .enable = perf_swevent_enable, .disable = perf_swevent_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; /* @@ -4022,22 +4328,14 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) struct perf_event *event; u64 period; - event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); event->pmu->read(event); - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. 
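The cpu-clock and task-clock events keep a prev_count timestamp and fold the elapsed delta into the count on every update; the exchange hands each slice of time to exactly one updater even if an interrupt-time update races in. The same pattern with C11 atomics instead of local64_t:

	#include <stdatomic.h>
	#include <stdint.h>

	struct clock_event {
		atomic_uint_least64_t count;		/* accumulated nanoseconds */
		atomic_uint_least64_t prev_count;	/* timestamp at the last update */
	};

	static void clock_event_update(struct clock_event *e, uint64_t now)
	{
		uint64_t prev = atomic_exchange(&e->prev_count, now);

		atomic_fetch_add(&e->count, now - prev);
	}

	int main(void)
	{
		struct clock_event ev = { 0, 0 };

		atomic_store(&ev.prev_count, 1000);
		clock_event_update(&ev, 1500);	/* adds 500 ns */
		clock_event_update(&ev, 2500);	/* adds 1000 ns */
		return 0;
	}
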
- */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; @@ -4096,8 +4394,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } static int cpu_clock_perf_event_enable(struct perf_event *event) @@ -4105,7 +4403,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&hwc->prev_count, cpu_clock(cpu)); perf_swevent_start_hrtimer(event); return 0; @@ -4137,9 +4435,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) u64 prev; s64 delta; - prev = atomic64_xchg(&event->hw.prev_count, now); + prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static int task_clock_perf_event_enable(struct perf_event *event) @@ -4149,7 +4447,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; - atomic64_set(&hwc->prev_count, now); + local64_set(&hwc->prev_count, now); perf_swevent_start_hrtimer(event); @@ -4185,33 +4483,124 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_PROFILE +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct perf_cpu_context *cpuctx) +{ + return rcu_dereference_protected(cpuctx->swevent_hlist, + lockdep_is_held(&cpuctx->hlist_mutex)); +} -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; + struct swevent_hlist *hlist; - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} - struct pt_regs *regs = get_irq_regs(); +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); - if (!regs) - regs = task_pt_regs(current); + if (!hlist) + return; - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } -EXPORT_SYMBOL_GPL(perf_tp_event); -static int perf_tp_event_match(struct perf_event *event, +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event 
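The swevent hash tables are per-CPU, lazily allocated and refcounted: the first software event needing a CPU's table allocates it, the last one drops it (freed via RCU in the kernel), and an event bound to no particular CPU takes a reference everywhere, rolling back on partial failure. A compact model with a fixed NR_CPUS and no locking:

	#include <stdlib.h>

	#define NR_CPUS 4

	struct swevent_hlist { int buckets; /* ... */ };

	static struct swevent_hlist *hlist[NR_CPUS];
	static int hlist_refcount[NR_CPUS];

	static int hlist_get_cpu(int cpu)
	{
		if (!hlist[cpu]) {
			hlist[cpu] = calloc(1, sizeof(*hlist[cpu]));
			if (!hlist[cpu])
				return -1;
		}
		hlist_refcount[cpu]++;
		return 0;
	}

	static void hlist_put_cpu(int cpu)
	{
		if (!--hlist_refcount[cpu]) {
			free(hlist[cpu]);	/* the kernel defers this through RCU */
			hlist[cpu] = NULL;
		}
	}

	/* An event bound to no particular CPU needs the table everywhere; if one
	 * CPU fails, drop the references already taken on the earlier ones. */
	static int hlist_get_all(void)
	{
		int cpu, failed;

		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (hlist_get_cpu(cpu)) {
				failed = cpu;
				goto rollback;
			}
		}
		return 0;
	rollback:
		for (cpu = 0; cpu < failed; cpu++)
			hlist_put_cpu(cpu);
		return -1;
	}
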
*event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + +#ifdef CONFIG_EVENT_TRACING + +static const struct pmu perf_ops_tracepoint = { + .enable = perf_trace_enable, + .disable = perf_trace_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, + .read = perf_swevent_read, + .unthrottle = perf_swevent_void, +}; + +static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { void *record = data->raw->data; @@ -4221,13 +4610,55 @@ static int perf_tp_event_match(struct perf_event *event, return 0; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* + * All tracepoints are from kernel-space. + */ + if (event->attr.exclude_kernel) + return 0; + + if (!perf_tp_filter_match(event, data)) + return 0; + + return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head, int rctx) +{ + struct perf_sample_data data; + struct perf_event *event; + struct hlist_node *node; + + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_add(event, count, 1, &data, regs); + } + + perf_swevent_put_recursion_context(rctx); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + static void tp_perf_event_destroy(struct perf_event *event) { - ftrace_profile_disable(event->attr.config); + perf_trace_destroy(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. 
@@ -4237,12 +4668,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (ftrace_profile_enable(event->attr.config)) + err = perf_trace_init(event); + if (err) return NULL; event->destroy = tp_perf_event_destroy; - return &perf_ops_generic; + return &perf_ops_tracepoint; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4270,12 +4702,6 @@ static void perf_event_free_filter(struct perf_event *event) #else -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - return 1; -} - static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; @@ -4290,7 +4716,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT static void bp_perf_event_destroy(struct perf_event *event) @@ -4316,8 +4742,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - sample.raw = NULL; - sample.addr = bp->attr.bp_addr; + perf_sample_data_init(&sample, bp->attr.bp_addr); if (!perf_exclude_event(bp, regs)) perf_swevent_add(bp, 1, 1, &sample, regs); @@ -4342,6 +4767,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4380,6 +4806,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -4458,7 +4890,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -4507,7 +4939,7 @@ done: if (!event->parent) { atomic_inc(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); @@ -4580,7 +5012,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -4598,54 +5030,53 @@ err_size: goto out; } -static int perf_event_set_output(struct perf_event *event, int output_fd) +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_event *output_event = NULL; - struct file *output_file = NULL; - struct perf_event *old_output; - int fput_needed = 0; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; - if (!output_fd) + if (!output_event) goto set; - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) + /* don't allow circular references */ + if (event == output_event) goto out; - output_event = output_file->private_data; - - /* Don't chain output fds */ - if (output_event->output) + /* + * Don't allow cross-cpu buffers + */ + if 
(output_event->cpu != event->cpu) goto out; - /* Don't set an output fd when we already have an output channel */ - if (event->data) + /* + * If its not a per-cpu buffer, it must be the same task. + */ + if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; - atomic_long_inc(&output_file->f_count); - set: mutex_lock(&event->mmap_mutex); - old_output = event->output; - rcu_assign_pointer(event->output, output_event); - mutex_unlock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this event. - */ - synchronize_rcu(); - fput(old_output->filp); + if (output_event) { + /* get the buffer we want to redirect to */ + buffer = perf_buffer_get(output_event); + if (!buffer) + goto unlock; } + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_buffer) + perf_buffer_put(old_buffer); out: - fput_light(output_file, fput_needed); return ret; } @@ -4661,13 +5092,13 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader; + struct perf_event *event, *group_leader = NULL, *output_event = NULL; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + int event_fd; int fput_needed = 0; - int fput_needed2 = 0; int err; /* for future expandability... */ @@ -4688,26 +5119,38 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + event_fd = get_unused_fd_flags(O_RDWR); + if (event_fd < 0) + return event_fd; + /* * Get the target context (task or percpu): */ ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_fd; + } + + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_put_context; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } /* * Look up the group leader (we will attach this event to it): */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + if (group_leader) { err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - group_leader = group_file->private_data; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): @@ -4729,22 +5172,21 @@ SYSCALL_DEFINE5(perf_event_open, event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, NULL, GFP_KERNEL); - err = PTR_ERR(event); - if (IS_ERR(event)) + if (IS_ERR(event)) { + err = PTR_ERR(event); goto err_put_context; + } - err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); - if (err < 0) - goto err_free_put_context; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_free_put_context; + } - event_file = fget_light(err, &fput_needed2); - if (!event_file) + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); goto err_free_put_context; - - if (flags & 
PERF_FLAG_FD_OUTPUT) { - err = perf_event_set_output(event, group_fd); - if (err) - goto err_fput_free_put_context; } event->filp = event_file; @@ -4760,19 +5202,23 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); -err_fput_free_put_context: - fput_light(event_file, fput_needed2); + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ + fput_light(group_file, fput_needed); + fd_install(event_fd, event_file); + return event_fd; err_free_put_context: - if (err < 0) - kfree(event); - + free_event(event); err_put_context: - if (err < 0) - put_ctx(ctx); - fput_light(group_file, fput_needed); - + put_ctx(ctx); +err_fd: + put_unused_fd(event_fd); return err; } @@ -4871,8 +5317,15 @@ inherit_event(struct perf_event *parent_event, else child_event->state = PERF_EVENT_STATE_OFF; - if (parent_event->attr.freq) - child_event->hw.sample_period = parent_event->hw.sample_period; + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } child_event->overflow_handler = parent_event->overflow_handler; @@ -4932,12 +5385,12 @@ static void sync_child_event(struct perf_event *child_event, if (child_event->attr.inherit_stat) perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_event->count); + child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_event->count); + atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, @@ -5037,10 +5490,14 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, group_entry) __perf_event_exit_task(child_event, child_ctx, child); @@ -5049,7 +5506,8 @@ again: * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. */ - if (!list_empty(&child_ctx->group_list)) + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) goto again; mutex_unlock(&child_ctx->mutex); @@ -5057,6 +5515,25 @@ again: put_ctx(child_ctx); } +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + perf_group_detach(event); + list_del_event(event, ctx); + free_event(event); +} + /* * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. 
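Editorial aside on the perf_event_open() rework above (not part of the patch): the syscall now reserves a file descriptor first and only publishes it with fd_install() after every other step has succeeded, so the error paths can back out without ever exposing a half-initialized fd. A minimal sketch of that pattern, using the same kernel helpers but with made-up names (example_create_fd, example_fops, priv):

#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

static int example_create_fd(const struct file_operations *example_fops, void *priv)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_RDWR);	/* reserve, but don't publish yet */
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[example]", example_fops, priv, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);		/* nothing visible to undo */
		return PTR_ERR(file);
	}

	/* ... any further setup that may fail goes here, before fd_install() ... */

	fd_install(fd, file);			/* publish: from here the fd is live */
	return fd;
}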
@@ -5071,36 +5548,70 @@ void perf_event_free_task(struct task_struct *task) mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { - struct perf_event *parent = event->parent; + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + perf_free_event(event, ctx); - if (WARN_ON_ONCE(!parent)) - continue; + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - fput(parent->filp); + mutex_unlock(&ctx->mutex); - list_del_event(event, ctx); - free_event(event); + put_ctx(ctx); +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx = child->perf_event_ctxp; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; } - if (!list_empty(&ctx->group_list)) - goto again; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ - mutex_unlock(&ctx->mutex); + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; - put_ctx(ctx); + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; } + /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx = NULL, *parent_ctx; + struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5138,41 +5649,22 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->group_list, group_entry) { - - if (!event->attr.inherit) { - inherited_all = 0; - continue; - } - - if (!child->perf_event_ctxp) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. 
- */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); - if (!child_ctx) { - ret = -ENOMEM; - break; - } - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - } + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) + break; + } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) break; - } } + child_ctx = child->perf_event_ctxp; + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent @@ -5200,18 +5692,37 @@ int perf_event_init_task(struct task_struct *child) return ret; } +static void __init perf_event_init_all_cpus(void) +{ + int cpu; + struct perf_cpu_context *cpuctx; + + for_each_possible_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); + __perf_event_init_context(&cpuctx->ctx, NULL); + } +} + static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx, NULL); spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); - hw_perf_event_setup(cpu); + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5221,7 +5732,9 @@ static void __perf_event_exit_cpu(void *info) struct perf_event_context *ctx = &cpuctx->ctx; struct perf_event *event, *tmp; - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_event_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } static void perf_event_exit_cpu(int cpu) @@ -5229,6 +5742,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); @@ -5242,20 +5759,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_event_setup_online(cpu); - break; - + case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: perf_event_exit_cpu(cpu); break; @@ -5276,6 +5788,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { void __init perf_event_init(void) { + perf_event_init_all_cpus(); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, @@ -5283,13 +5796,16 @@ void 
__init perf_event_init(void) register_cpu_notifier(&perf_cpu_nb); } -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", perf_reserved_percpu); } static ssize_t perf_set_reserve_percpu(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { @@ -5318,13 +5834,17 @@ perf_set_reserve_percpu(struct sysdev_class *class, return count; } -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +static ssize_t perf_show_overcommit(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", perf_overcommit); } static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +perf_set_overcommit(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + const char *buf, size_t count) { unsigned long val; int err; diff --git a/kernel/pid.c b/kernel/pid.c index 2e17c9c92cb..d55c6fb8d08 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid) atomic_inc(&map->nr_free); } +/* + * If we started walking pids at 'base', is 'a' seen before 'b'? + */ +static int pid_before(int base, int a, int b) +{ + /* + * This is the same as saying + * + * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT + * and that mapping orders 'a' and 'b' with respect to 'base'. + */ + return (unsigned)(a - base) < (unsigned)(b - base); +} + +/* + * We might be racing with someone else trying to set pid_ns->last_pid. + * We want the winner to have the "later" value, because if the + * "earlier" value prevails, then a pid may get reused immediately. + * + * Since pids rollover, it is not sufficient to just pick the bigger + * value. We have to consider where we started counting from. + * + * 'base' is the value of pid_ns->last_pid that we observed when + * we started looking for a pid. + * + * 'pid' is the pid that we eventually found. + */ +static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) +{ + int prev; + int last_write = base; + do { + prev = last_write; + last_write = cmpxchg(&pid_ns->last_pid, prev, pid); + } while ((prev != last_write) && (pid_before(base, last_write, pid))); +} + static int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; @@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; + /* + * If last_pid points into the middle of the map->page we + * want to scan this bitmap block twice, the second time + * we start with offset == 0 (or RESERVED_PIDS). 
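Editorial illustration of the lock-free last_pid update introduced in kernel/pid.c above (not part of the patch): pid_before() orders values on the wrapping pid space relative to the point where the walk started, and the cmpxchg() loop only retries while the value it observed is still "earlier" than the pid we allocated. A user-space model, with a GCC/Clang __sync builtin standing in for the kernel's cmpxchg():

#include <stdio.h>

/* Is 'a' seen before 'b' when walking pids upward from 'base' (with wraparound)? */
static int pid_before(int base, int a, int b)
{
	return (unsigned)(a - base) < (unsigned)(b - base);
}

/* Keep the "later" of the racing writers' values in *last_pid. */
static void set_last_pid(int *last_pid, int base, int pid)
{
	int prev, last_write = base;

	do {
		prev = last_write;
		last_write = __sync_val_compare_and_swap(last_pid, prev, pid);
	} while (prev != last_write && pid_before(base, last_write, pid));
}

int main(void)
{
	int last_pid = 305;	/* a racing allocator already published 305 */

	set_last_pid(&last_pid, 300, 302);	/* 302 is "earlier": dropped */
	printf("after 302: %d\n", last_pid);	/* 305 */

	set_last_pid(&last_pid, 300, 310);	/* 310 is "later": retried and wins */
	printf("after 310: %d\n", last_pid);	/* 310 */
	return 0;
}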
+ */ + max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); @@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pid_ns->last_pid = pid; + set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); pid = mk_pid(pid_ns, map, offset); - /* - * find_next_offset() found a bit, the pid from it - * is in-bounds, and if we fell back to the last - * bitmap block and the final block was the same - * as the starting point, pid is before last_pid. - */ - } while (offset < BITS_PER_PAGE && pid < pid_max && - (i != max_scan || pid < last || - !((last+1) & BITS_PER_PAGE_MASK))); + } while (offset < BITS_PER_PAGE && pid < pid_max); } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; @@ -367,7 +401,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference(pid->tasks[type].first); + first = rcu_dereference_check(pid->tasks[type].first, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); } @@ -376,7 +412,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) EXPORT_SYMBOL(pid_task); /* - * Must be called under rcu_read_lock() or with tasklist_lock read-held. + * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { @@ -511,6 +547,13 @@ void __init pidhash_init(void) void __init pidmap_init(void) { + /* bump default and minimum pid_max based on number of cpus */ + pid_max = min(pid_max_max, max_t(int, pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + pid_max_min = max_t(int, pid_max_min, + PIDS_PER_CPU_MIN * num_possible_cpus()); + pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 86b3796b043..a5aff94e1f0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -13,6 +13,7 @@ #include <linux/syscalls.h> #include <linux/err.h> #include <linux/acct.h> +#include <linux/slab.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rcu_read_lock(); /* - * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring - * any nested-container's init processes don't ignore the - * signal + * Any nested-container's init processes won't ignore the + * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). */ task = pid_task(find_vpid(nr), PIDTYPE_PID); if (task) - force_sig(SIGKILL, task); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); rcu_read_unlock(); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 3db49b9ca37..645e541a45f 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -2,7 +2,7 @@ * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: * - * Dependents on a QoS value : register requirements + * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. 
Dependents register their QoS needs. @@ -14,19 +14,21 @@ * timeout: usec <-- currently not used. * throughput: kbs (kilo byte / sec) * - * There are lists of pm_qos_objects each one wrapping requirements, notifiers + * There are lists of pm_qos_objects each one wrapping requests, notifiers * - * User mode requirements on a QOS parameter register themselves to the + * User mode requests on a QOS parameter register themselves to the * subsystem by opening the device node /dev/... and writing there request to * the node. As long as the process holds a file handle open to the node the * client continues to be accounted for. Upon file release the usermode - * requirement is removed and a new qos target is computed. This way when the - * requirement that the application has is cleaned up when closes the file + * request is removed and a new qos target is computed. This way when the + * request that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * * Mark Gross <mgross@linux.intel.com> */ +/*#define DEBUG*/ + #include <linux/pm_qos_params.h> #include <linux/sched.h> #include <linux/spinlock.h> @@ -42,64 +44,53 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to requirements or notifiers lists + * locking rule: all changes to requests or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct requirement_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - char *name; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct requirement_list requirements; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requirements = - {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -110,8 
+101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -123,43 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; -static void update_target(int target) + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct requirement_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[target]->default_value; - list_for_each_entry(node, - &pm_qos_array[target]->requirements.list, list) { - extreme_value = pm_qos_array[target]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[target]->target_value, extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - atomic_read(&pm_qos_array[target]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain(pm_qos_array[target]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -185,125 +186,125 @@ static int find_pm_qos_object_by_minor(int minor) } /** - * pm_qos_requirement - returns current system wide qos expectation + * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * * This function returns the current target value in an atomic manner. 
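Editorial note on the data-structure change above (not part of the patch): keeping the requests in a value-sorted list turns the old O(n) comparator walk in update_target() into a constant-time read of one end of the list. A stand-alone sketch of that idea with a plain sorted array rather than the kernel's plist:

#include <stdio.h>

/* Requests kept sorted in ascending order of their value. */
static const int requests[] = { 20, 50, 100, 2000 };
static const int nr_requests = sizeof(requests) / sizeof(requests[0]);

enum qos_type { QOS_MIN, QOS_MAX };

/* Aggregate target: one array access instead of walking every request. */
static int get_value(enum qos_type type, int default_value)
{
	if (nr_requests == 0)
		return default_value;	/* mirrors the "list empty" case */
	return type == QOS_MIN ? requests[0] : requests[nr_requests - 1];
}

int main(void)
{
	printf("min-style target: %d\n", get_value(QOS_MIN, 2000000));	/* 20 */
	printf("max-style target: %d\n", get_value(QOS_MAX, 0));	/* 2000 */
	return 0;
}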
*/ -int pm_qos_requirement(int pm_qos_class) +int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } -EXPORT_SYMBOL_GPL(pm_qos_requirement); +EXPORT_SYMBOL_GPL(pm_qos_request); + +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); /** - * pm_qos_add_requirement - inserts new qos request into the list - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_add_request - inserts new qos request into the list + * @dep: pointer to a preallocated handle + * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters. + * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * handle. Caller needs to save this handle for later use in updates and + * removal. */ -int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) + +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct requirement_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->name = kstrdup(name, GFP_KERNEL); - if (!dep->name) - goto cleanup; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requirements.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); - - return 0; + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - -cleanup: - kfree(dep); - return -ENOMEM; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } -EXPORT_SYMBOL_GPL(pm_qos_add_requirement); +EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_update_request - modifies an existing qos request + * @pm_qos_req : handle to list element holding a pm_qos request to use * @value: defines the qos request * - * Updates an existing qos requirement for the pm_qos_class of parameters along + * Updates an existing qos request for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the list then no change is made. + * Attempts are made to make this code callable on hot code paths. 
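Editorial usage sketch of the reworked in-kernel API above (not part of the patch): callers now own a preallocated struct pm_qos_request_list handle and pass it to add/update/remove instead of registering a request by name. The QoS class and latency values below are illustrative only:

#include <linux/pm_qos_params.h>

static struct pm_qos_request_list example_req;	/* caller-owned handle */

static void example_start_low_latency(void)
{
	/* register: the aggregate target is recomputed, the handle must be kept */
	pm_qos_add_request(&example_req, PM_QOS_CPU_DMA_LATENCY, 50 /* usec */);
}

static void example_relax_latency(void)
{
	/* cheap update, intended to be callable on hot paths */
	pm_qos_update_request(&example_req, 200 /* usec */);
}

static void example_stop(void)
{
	/* drop the request, recompute the target; the handle is cleared */
	pm_qos_remove_request(&example_req);
}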
*/ -int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, + s32 new_value) { - unsigned long flags; - struct requirement_list *node; - int pending_update = 0; + s32 temp; + struct pm_qos_object *o; - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - if (new_value == PM_QOS_DEFAULT_VALUE) - node->value = - pm_qos_array[pm_qos_class]->default_value; - else - node->value = new_value; - pending_update = 1; - break; - } + if (!pm_qos_req) /*guard against callers passing in null */ + return; + + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); - return 0; + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } -EXPORT_SYMBOL_GPL(pm_qos_update_requirement); +EXPORT_SYMBOL_GPL(pm_qos_update_request); /** - * pm_qos_remove_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_remove_request - modifies an existing qos request + * @pm_qos_req: handle to request list element * - * Will remove named qos request from pm_qos_class list of parameters and - * recompute the current target value for the pm_qos_class. + * Will remove pm qos request from the list of requests and + * recompute the current target value for the pm_qos_class. Call this + * on slow code paths. */ -void pm_qos_remove_requirement(int pm_qos_class, char *name) +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - struct requirement_list *node; - int pending_update = 0; + struct pm_qos_object *o; - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - kfree(node->name); - list_del(&node->list); - kfree(node); - pending_update = 1; - break; - } + if (pm_qos_req == NULL) + return; + /* silent return to keep pcm code cleaner */ + + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } -EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); +EXPORT_SYMBOL_GPL(pm_qos_remove_request); /** * pm_qos_add_notifier - sets notification entry for changes to target value @@ -313,7 +314,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * will register the notifier into a notification chain that gets called * upon changes to the pm_qos_class target value. 
*/ - int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { int retval; @@ -343,21 +344,20 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN 32 - static int pm_qos_power_open(struct inode *inode, struct file *filp) { - int ret; long pm_qos_class; - char name[PID_NAME_LEN]; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *)pm_qos_class; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - ret = pm_qos_add_requirement(pm_qos_class, name, - PM_QOS_DEFAULT_VALUE); - if (ret >= 0) + struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; + + if (filp->private_data) return 0; } return -EPERM; @@ -365,32 +365,43 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { - int pm_qos_class; - char name[PID_NAME_LEN]; + struct pm_qos_request_list *req; - pm_qos_class = (long)filp->private_data; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_remove_requirement(pm_qos_class, name); + req = filp->private_data; + pm_qos_remove_request(req); + kfree(req); return 0; } + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - int pm_qos_class; - char name[PID_NAME_LEN]; - - pm_qos_class = (long)filp->private_data; - if (count != sizeof(s32)) + int x; + char ascii_value[11]; + struct pm_qos_request_list *pm_qos_req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else if (count == 11) { /* len('0x12345678/0') */ + if (copy_from_user(ascii_value, buf, 11)) + return -EFAULT; + if (strlen(ascii_value) != 10) + return -EINVAL; + x = sscanf(ascii_value, "%x", &value); + if (x != 1) + return -EINVAL; + pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); + } else return -EINVAL; - if (copy_from_user(&value, buf, sizeof(s32))) - return -EFAULT; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_update_requirement(pm_qos_class, name, value); - return sizeof(s32); + pm_qos_req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_update_request(pm_qos_req, value); + + return count; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff452351..6842eeba587 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -11,19 +11,18 @@ #include <trace/events/timer.h> /* - * Called after updating RLIMIT_CPU to set timer expiration if necessary. + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. 
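Editorial note on the pm_qos character-device interface above (not part of the patch): pm_qos_power_write() now accepts either a raw 32-bit value or an 11-byte NUL-terminated "0x????????" string, and the per-open request handle replaces the old per-process name. A user-space sketch, assuming the miscdevice shows up as /dev/cpu_dma_latency:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int32_t usec = 50;
	int fd = open("/dev/cpu_dma_latency", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* binary form: exactly sizeof(s32) bytes */
	if (write(fd, &usec, sizeof(usec)) != (ssize_t)sizeof(usec))
		perror("binary write");

	/* ASCII form: 10 hex characters plus the terminating NUL, 11 bytes (0x32 == 50 usec) */
	if (write(fd, "0x00000032", 11) != 11)
		perror("ascii write");

	/* the request stays in force only while the fd is open */
	close(fd);
	return 0;
}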
*/ -void update_rlimit_cpu(unsigned long rlim_new) +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - struct signal_struct *const sig = current->signal; - if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || - cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - } + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); } static int check_clock(const clockid_t which_clock) @@ -233,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - struct sighand_struct *sighand; - struct signal_struct *sig; + struct signal_struct *sig = tsk->signal; struct task_struct *t; - *times = INIT_CPUTIME; + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (!sighand) + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) goto out; - sig = tsk->signal; - t = tsk; do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += t->se.sum_exec_runtime; - - t = next_thread(t); - } while (t != tsk); - - times->utime = cputime_add(times->utime, sig->utime); - times->stime = cputime_add(times->stime, sig->stime); - times->sum_exec_runtime += sig->sum_sched_runtime; + } while_each_thread(tsk, t); out: rcu_read_unlock(); } @@ -364,7 +356,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) } } else { read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->signal) { + if (thread_group_leader(p) && p->sighand) { error = cpu_clock_sample_group(which_clock, p, &rtn); @@ -440,7 +432,7 @@ int posix_cpu_timer_del(struct k_itimer *timer) if (likely(p != NULL)) { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * We raced with the reaping of the task. * The deletion should have cleared us off the list. @@ -548,111 +540,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) cputime_gt(expires, new_exp); } -static inline int expires_le(cputime_t expires, cputime_t new_exp) -{ - return !cputime_eq(expires, cputime_zero) && - cputime_le(expires, new_exp); -} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held - * for reading, and interrupts disabled. + * for reading, interrupts disabled and p->sighand->siglock taken. */ -static void arm_timer(struct k_itimer *timer, union cpu_time_count now) +static void arm_timer(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; + struct task_cputime *cputime_expires; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; - unsigned long i; - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 
- p->cpu_timers : p->signal->cpu_timers); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); - spin_lock(&p->sighand->siglock); - listpos = head; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) - break; - listpos = &next->entry; - } - } else { - list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) - break; - listpos = &next->entry; - } + list_for_each_entry(next, head, entry) { + if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + break; + listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { + union cpu_time_count *exp = &nt->expires; + /* - * We are the new earliest-expiring timer. - * If we are a thread timer, there can always - * be a process timer telling us to stop earlier. + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - union cpu_time_count *exp = &nt->expires; - - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_PROF: - if (expires_gt(p->cputime_expires.prof_exp, - exp->cpu)) - p->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_VIRT: - if (expires_gt(p->cputime_expires.virt_exp, - exp->cpu)) - p->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > exp->sched) - p->cputime_expires.sched_exp = - exp->sched; - break; - } - } else { - struct signal_struct *const sig = p->signal; - union cpu_time_count *exp = &timer->it.cpu.expires; - - /* - * For a process timer, set the cached expiration time. - */ - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_VIRT: - if (expires_le(sig->it[CPUCLOCK_VIRT].expires, - exp->cpu)) - break; - sig->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_PROF: - if (expires_le(sig->it[CPUCLOCK_PROF].expires, - exp->cpu)) - break; - i = sig->rlim[RLIMIT_CPU].rlim_cur; - if (i != RLIM_INFINITY && - i <= cputime_to_secs(exp->cpu)) - break; - sig->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - sig->cputime_expires.sched_exp = exp->sched; - break; - } + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, exp->cpu)) + cputime_expires->prof_exp = exp->cpu; + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, exp->cpu)) + cputime_expires->virt_exp = exp->cpu; + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp->sched) + cputime_expires->sched_exp = exp->sched; + break; } } - - spin_unlock(&p->sighand->siglock); } /* @@ -660,7 +603,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ static void cpu_timer_fire(struct k_itimer *timer) { - if (unlikely(timer->sigq == NULL)) { + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. 
+ */ + timer->it.cpu.expires.sched = 0; + } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. @@ -721,7 +669,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, val; + union cpu_time_count old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -736,10 +684,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, read_lock(&tasklist_lock); /* * We need the tasklist_lock to protect against reaping that - * clears p->signal. If p has just been reaped, we can no + * clears p->sighand. If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { read_unlock(&tasklist_lock); put_task_struct(p); timer->it.cpu.task = NULL; @@ -752,6 +700,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, BUG_ON(!irqs_disabled()); ret = 0; + old_incr = timer->it.cpu.incr; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { @@ -759,7 +708,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, ret = TIMER_RETRY; } else list_del_init(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); /* * We need to sample the current value to convert the new @@ -813,6 +761,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). */ + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); goto out; } @@ -828,11 +777,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, */ timer->it.cpu.expires = new_expires; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && cpu_time_before(timer->it_clock, val, new_expires)) { - arm_timer(timer, val); + arm_timer(timer); } + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); /* @@ -853,7 +802,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun = -1; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && !cpu_time_before(timer->it_clock, val, new_expires)) { /* * The designated time already passed, so we notify @@ -867,7 +815,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, out: if (old) { sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &old->it_interval); + old_incr, &old->it_interval); } return ret; } @@ -908,7 +856,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) clear_dead = p->exit_state; } else { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. @@ -927,25 +875,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) read_unlock(&tasklist_lock); } - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - if (timer->it.cpu.incr.sched == 0 && - cpu_time_before(timer->it_clock, - timer->it.cpu.expires, now)) { - /* - * Do-nothing timer expired and has no reload, - * so it's as if it was never set. - */ - timer->it.cpu.expires.sched = 0; - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - /* - * Account for any expirations and reloads that should - * have happened. 
- */ - bump_cpu_timer(timer, now); - } - if (unlikely(clear_dead)) { /* * We've noticed that the thread is dead, but @@ -982,6 +911,7 @@ static void check_thread_timers(struct task_struct *tsk, int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + unsigned long soft; maxfire = 20; tsk->cputime_expires.prof_exp = cputime_zero; @@ -1030,9 +960,10 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { - unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; - unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; + soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -1043,14 +974,13 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur - < sig->rlim[RLIMIT_RTTIME].rlim_max) { - sig->rlim[RLIMIT_RTTIME].rlim_cur += - USEC_PER_SEC; + if (soft < hard) { + soft += USEC_PER_SEC; + sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } printk(KERN_INFO "RT Watchdog Timeout: %s[%d]\n", @@ -1060,14 +990,11 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct task_struct *tsk) +static void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - if (!cputimer->running) - return; - spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); @@ -1107,6 +1034,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1121,19 +1065,7 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; - - /* - * Don't sample the current process CPU clocks if there are no timers. - */ - if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && - sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && - list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(tsk); - return; - } + unsigned long soft; /* * Collect the current process totals. 
@@ -1193,11 +1125,13 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { + if (psecs >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -1205,35 +1139,28 @@ static void check_process_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { + if (psecs >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (sig->rlim[RLIMIT_CPU].rlim_cur - < sig->rlim[RLIMIT_CPU].rlim_max) { - sig->rlim[RLIMIT_CPU].rlim_cur++; + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + x = secs_to_cputime(soft); if (cputime_eq(prof_expires, cputime_zero) || cputime_lt(x, prof_expires)) { prof_expires = x; } } - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) - sig->cputime_expires.prof_exp = prof_expires; - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) - sig->cputime_expires.virt_exp = virt_expires; - if (sched_expires != 0 && - (sig->cputime_expires.sched_exp == 0 || - sig->cputime_expires.sched_exp > sched_expires)) - sig->cputime_expires.sched_exp = sched_expires; + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); } /* @@ -1262,9 +1189,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ + spin_lock(&p->sighand->siglock); } else { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. @@ -1282,6 +1210,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) clear_dead_task(timer, now); goto out_unlock; } + spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); /* Leave the tasklist_lock locked for the call below. */ @@ -1290,7 +1219,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - arm_timer(timer, now); + BUG_ON(!irqs_disabled()); + arm_timer(timer); + spin_unlock(&p->sighand->siglock); out_unlock: read_unlock(&tasklist_lock); @@ -1302,23 +1233,6 @@ out: } /** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. 
- */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (cputime_eq(cputime->utime, cputime_zero) && - cputime_eq(cputime->stime, cputime_zero) && - cputime->sum_exec_runtime == 0) - return 1; - return 0; -} - -/** * task_cputime_expired - Compare two task_cputime entities. * * @sample: The task_cputime structure to be checked for expiration. @@ -1358,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - /* tsk == current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { .utime = tsk->utime, @@ -1374,15 +1284,18 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - if (!task_cputime_zero(&sig->cputime_expires)) { + if (sig->cputimer.running) { struct task_cputime group_sample; - thread_group_cputimer(tsk, &group_sample); + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } - return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; + return 0; } /* @@ -1394,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -1404,14 +1318,20 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (!fastpath_timer_check(tsk)) return; - spin_lock(&tsk->sighand->siglock); + if (!lock_task_sighand(tsk, &flags)) + return; /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and * put them on the firing list. */ check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. @@ -1421,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, @@ -1448,21 +1368,23 @@ void run_posix_cpu_timers(struct task_struct *tsk) } /* - * Set one of the process-wide special case CPU timers. + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. - * The *newval argument is relative and we update it to be absolute, *oldval - * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { union cpu_time_count now; - struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. 
*/ @@ -1475,33 +1397,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (cputime_eq(*newval, cputime_zero)) return; *newval = cputime_add(*newval, now.cpu); - - /* - * If the RLIMIT_CPU timer will expire before the - * ITIMER_PROF timer, we have nothing else to do. - */ - if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur - < cputime_to_secs(*newval)) - return; } /* - * Check whether there are any process timers already set to fire - * before this one. If so, we don't have anything more to do. + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. */ - head = &tsk->signal->cpu_timers[clock_idx]; - if (list_empty(head) || - cputime_ge(list_first_entry(head, - struct cpu_timer_list, entry)->expires.cpu, - *newval)) { - switch (clock_idx) { - case CPUCLOCK_PROF: + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) tsk->signal->cputime_expires.virt_exp = *newval; - break; - } + break; } } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 495440779ce..9ca4973f736 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, return 0; } -int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) { *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; @@ -559,19 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); - if (error) - goto out; - /* - * return the timer_id now. The next step is hard to - * back out if there is an error. - */ - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } if (timer_event_spec) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) { error = -EFAULT; @@ -597,6 +585,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + if (error) + goto out; + spin_lock_irq(¤t->sighand->siglock); new_timer->it_signal = current->signal; list_add(&new_timer->list, ¤t->signal->posix_timers); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 91e09d3b2eb..ca6066a6952 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,6 +27,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting PM bugs, like suspend support. +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + default n + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". 
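As a user-space counterpart to the posix-timers.c timer_create() reordering and the posix-cpu-timers.c expiration-cache hunks above, a process CPU-clock timer created through that path might be exercised as follows; the SIGALRM choice and the one-second expiry are arbitrary examples, not part of the patch:

/* Illustrative only: create a CPU-clock timer via the reworked timer_create()
 * path.  Build with -lrt on older glibc. */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
        timer_t id;
        struct sigevent sev;
        struct itimerspec its;
        volatile unsigned long spin = 0;

        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGALRM;      /* delivered through the timer's sigqueue */

        /* Process-wide CPU-time clock, serviced by posix-cpu-timers.c. */
        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &id) < 0) {
                perror("timer_create");
                return 1;
        }

        memset(&its, 0, sizeof(its));
        its.it_value.tv_sec = 1;        /* fire after one second of CPU time */
        if (timer_settime(id, 0, &its, NULL) < 0) {
                perror("timer_settime");
                return 1;
        }

        while (1)                       /* spin; default SIGALRM action terminates us */
                spin++;
}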
+ config PM_VERBOSE bool "Verbose Power Management debugging" depends on PM_DEBUG @@ -85,9 +94,18 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y +config PM_SLEEP_ADVANCED_DEBUG + bool + depends on PM_ADVANCED_DEBUG + default n + +config SUSPEND_NVS + bool + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE + select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -116,13 +134,10 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. -config HIBERNATION_NVS - bool - config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE - select HIBERNATION_NVS if HAS_IOMEM + select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -222,3 +237,8 @@ config PM_RUNTIME and the bus type drivers of the buses the devices are on are responsible for the actual handling of the autosuspend requests and wake-up events. + +config PM_OPS + bool + depends on PM_SLEEP || PM_RUNTIME + default y diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 43191815f87..f9063c6b185 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o -obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o +obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c new file mode 100644 index 00000000000..83bbc7c02df --- /dev/null +++ b/kernel/power/block_io.c @@ -0,0 +1,103 @@ +/* + * This file provides functions for block I/O operations on swap/file. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + */ + +#include <linux/bio.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/swap.h> + +#include "power.h" + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. 
+ */ +static int submit(int rw, struct block_device *bdev, sector_t sector, + struct page *page, struct bio **bio_chain) +{ + const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)sector); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(bio_rw, bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(bio_rw, bio); + } + return 0; +} + +int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d752..8dc31e02ae1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. @@ -22,6 +22,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/gfp.h> #include <scsi/scsi_scan.h> #include <asm/suspend.h> @@ -276,7 +277,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -287,8 +288,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -323,10 +326,11 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; + gfp_t saved_mask; error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -334,6 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -351,6 +356,7 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: platform_end(platform_mode); @@ -445,14 +451,17 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; + gfp_t saved_mask; pm_prepare_console(); suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } + set_gfp_allowed_mask(saved_mask); resume_console(); pm_restore_console(); return error; @@ -466,6 +475,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; + gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -481,6 +491,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -502,22 +513,29 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 0998c713905..62b0bc6e498 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val) == NOTIFY_BAD) ? -EINVAL : 0; } +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; + +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_async_enabled); +} + +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_async_enabled = val; + return n; +} + +power_attr(pm_async); + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -178,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. 
Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (sscanf(buf, "%lu", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -208,9 +288,13 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, #endif -#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, + &wakeup_count_attr.attr, +#ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif +#endif NULL, }; diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c index 39ac698ef83..1836db60bbb 100644 --- a/kernel/power/hibernate_nvs.c +++ b/kernel/power/nvs.c @@ -10,11 +10,12 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/suspend.h> /* * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent + * suspend and to restore the contents of this memory during the subsequent * resume. The code below implements a mechanism allowing us to do that. */ @@ -29,7 +30,7 @@ struct nvs_page { static LIST_HEAD(nvs_list); /** - * hibernate_nvs_register - register platform NVS memory region to save + * suspend_nvs_register - register platform NVS memory region to save * @start - physical address of the region * @size - size of the region * @@ -37,7 +38,7 @@ static LIST_HEAD(nvs_list); * things so that the data from page-aligned addresses in this region will * be copied into separate RAM pages. 
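Going back to the 'wakeup_count' handshake documented in the kernel/power/main.c hunk above, a minimal user-space sketch of that read/prepare/write-back sequence could look like the following; the sysfs paths are those named in the comment, while the "mem" target and the error handling details are only an example:

/* Sketch of the wakeup_count handshake: read the count, finish user-space
 * suspend preparations, write the same value back, and only then request the
 * transition.  A failed write-back means a wakeup event arrived in between
 * and the suspend should be abandoned. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char count[32];
        ssize_t n;
        int fd;

        fd = open("/sys/power/wakeup_count", O_RDONLY);
        if (fd < 0) {
                perror("wakeup_count");
                return 1;
        }
        n = read(fd, count, sizeof(count) - 1); /* -EINTR: events in flight, retry later */
        close(fd);
        if (n <= 0)
                return 1;
        count[n] = '\0';

        /* ... user-space pre-suspend work goes here ... */

        fd = open("/sys/power/wakeup_count", O_WRONLY);
        if (fd < 0)
                return 1;
        if (write(fd, count, n) < 0) {
                fprintf(stderr, "wakeup event detected, aborting suspend\n");
                close(fd);
                return 1;
        }
        close(fd);

        fd = open("/sys/power/state", O_WRONLY);
        if (fd < 0)
                return 1;
        if (write(fd, "mem", 3) < 0)    /* also fails if events arrived after the write above */
                perror("suspend aborted");
        close(fd);
        return 0;
}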
*/ -int hibernate_nvs_register(unsigned long start, unsigned long size) +int suspend_nvs_register(unsigned long start, unsigned long size) { struct nvs_page *entry, *next; @@ -67,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size) } /** - * hibernate_nvs_free - free data pages allocated for saving NVS regions + * suspend_nvs_free - free data pages allocated for saving NVS regions */ -void hibernate_nvs_free(void) +void suspend_nvs_free(void) { struct nvs_page *entry; @@ -85,16 +86,16 @@ void hibernate_nvs_free(void) } /** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions */ -int hibernate_nvs_alloc(void) +int suspend_nvs_alloc(void) { struct nvs_page *entry; list_for_each_entry(entry, &nvs_list, node) { entry->data = (void *)__get_free_page(GFP_KERNEL); if (!entry->data) { - hibernate_nvs_free(); + suspend_nvs_free(); return -ENOMEM; } } @@ -102,9 +103,9 @@ int hibernate_nvs_alloc(void) } /** - * hibernate_nvs_save - save NVS memory regions + * suspend_nvs_save - save NVS memory regions */ -void hibernate_nvs_save(void) +void suspend_nvs_save(void) { struct nvs_page *entry; @@ -118,12 +119,12 @@ void hibernate_nvs_save(void) } /** - * hibernate_nvs_restore - restore NVS memory regions + * suspend_nvs_restore - restore NVS memory regions * * This function is going to be called with interrupts disabled, so it * cannot iounmap the virtual addresses used to access the NVS region. */ -void hibernate_nvs_restore(void) +void suspend_nvs_restore(void) { struct nvs_page *entry; diff --git a/kernel/power/power.h b/kernel/power/power.h index 46c5a26630a..006270fe382 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void); */ struct snapshot_handle { - loff_t offset; /* number of the last byte ready for reading - * or writing in the sequence - */ unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. 
current) */ - unsigned int cur_offset; /* offset with respect to the current - * block (for the next operation) - */ - unsigned int prev; /* number of the block of PAGE_SIZE bytes that - * was the current one previously - */ void *buffer; /* address of the block to read from * or write to */ - unsigned int buf_offset; /* location to read from or write to, - * given as a displacement from 'buffer' - */ int sync_read; /* Set to one to notify the caller of * snapshot_write_next() that it may * need to call wait_on_bio_chain() @@ -125,12 +113,12 @@ struct snapshot_handle { * snapshot_read_next()/snapshot_write_next() is allowed to * read/write data after the function returns */ -#define data_of(handle) ((handle).buffer + (handle).buf_offset) +#define data_of(handle) ((handle).buffer) extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); -extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); -extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); @@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); + struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index e8b33700627..d52359374e8 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct tty_struct *tty) +static void handle_poweroff(int key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); diff --git a/kernel/power/process.c b/kernel/power/process.c index 5ade1bdcf36..028a99598f4 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> +#include <linux/workqueue.h> /* * Timeout for stopping processes @@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; @@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + while (true) { todo = 0; read_lock(&tasklist_lock); @@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + if (!todo || time_after(jiffies, end_time)) break; @@ -86,14 +98,17 @@ static int try_to_freeze_tasks(bool sig_only) */ printk("\n"); printk(KERN_ERR "Freezing of tasks failed 
after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); - show_state(); + "(%d tasks refusing to freeze, wq_busy=%d):\n", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); if (freezing(p) && !freezer_should_skip(p)) - printk(KERN_ERR " %s\n", p->comm); + sched_show_task(p); cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); @@ -145,7 +160,7 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; - if (cgroup_frozen(p)) + if (cgroup_freezing_or_frozen(p)) continue; thaw_process(p); @@ -158,6 +173,7 @@ void thaw_processes(void) oom_killer_enable(); printk("Restarting tasks ... "); + thaw_workqueues(); thaw_tasks(true); thaw_tasks(false); schedule(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 36cb168e433..d3f795f01bb 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -26,6 +26,7 @@ #include <linux/console.h> #include <linux/highmem.h> #include <linux/list.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1120,9 +1121,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) return nr_alloc; } -static unsigned long preallocate_image_memory(unsigned long nr_pages) +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) { - return preallocate_image_pages(nr_pages, GFP_IMAGE); + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); } #ifdef CONFIG_HIGHMEM @@ -1168,20 +1179,27 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, */ static void free_unnecessary_pages(void) { - unsigned long save_highmem, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem; - to_free_normal = alloc_normal - count_data_pages(); - save_highmem = count_highmem_pages(); - if (alloc_highmem > save_highmem) { - to_free_highmem = alloc_highmem - save_highmem; + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; } else { to_free_highmem = 0; - to_free_normal -= save_highmem - alloc_highmem; + to_free_normal -= save - alloc_highmem; } memory_bm_position_reset(©_bm); - while (to_free_normal > 0 && to_free_highmem > 0) { + while (to_free_normal > 0 || to_free_highmem > 0) { unsigned long pfn = memory_bm_next_pfn(©_bm); struct page *page = pfn_to_page(pfn); @@ -1257,7 +1275,7 @@ int hibernate_preallocate_memory(void) { struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; - unsigned long alloc, save_highmem, pages_highmem; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; struct timeval start, stop; int error; @@ -1294,6 +1312,7 @@ int hibernate_preallocate_memory(void) else count 
+= zone_page_state(zone, NR_FREE_PAGES); } + avail_normal = count; count += highmem; count -= totalreserve_pages; @@ -1308,12 +1327,21 @@ int hibernate_preallocate_memory(void) */ if (size >= saveable) { pages = preallocate_image_highmem(save_highmem); - pages += preallocate_image_memory(saveable - pages); + pages += preallocate_image_memory(saveable - pages, avail_normal); goto out; } /* Estimate the minimum size of the image. */ pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; if (size < pages) size = min_t(unsigned long, pages, max_size); @@ -1334,16 +1362,34 @@ int hibernate_preallocate_memory(void) */ pages_highmem = preallocate_image_highmem(highmem / 2); alloc = (count - max_size) - pages_highmem; - pages = preallocate_image_memory(alloc); - if (pages < alloc) - goto err_out; - size = max_size - size; - alloc = size; - size = preallocate_highmem_fraction(size, highmem, count); - pages_highmem += size; - alloc -= size; - pages += preallocate_image_memory(alloc); - pages += pages_highmem; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } /* * We only need as many page frames for the image as there are saveable @@ -1500,7 +1546,7 @@ asmlinkage int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image: \n"); + printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); @@ -1603,14 +1649,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the end of data stream condition, * and a negative number is returned on error. In such cases the @@ -1618,7 +1659,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * any more. 
*/ -int snapshot_read_next(struct snapshot_handle *handle, size_t count) +int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; @@ -1629,7 +1670,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) return -ENOMEM; } - if (!handle->offset) { + if (!handle->cur) { int error; error = init_header((struct swsusp_info *)buffer); @@ -1638,42 +1679,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); - } - if (handle->prev < handle->cur) { - if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; + } else if (handle->cur <= nr_meta_pages) { + memset(buffer, 0, PAGE_SIZE); + pack_pfns(buffer, &orig_bm); + } else { + struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). - */ - void *kaddr; + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buffer, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); } - handle->prev = handle->cur; } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; - } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2132,14 +2161,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the "end of file" condition, * and a negative number is returned on error. In such cases the @@ -2147,16 +2171,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * any more. 
*/ -int snapshot_write_next(struct snapshot_handle *handle, size_t count) +int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; int error = 0; /* Check if we have already loaded the entire image */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (handle->offset == 0) { + handle->sync_read = 1; + + if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ buffer = get_image_page(GFP_ATOMIC, PG_ANY); @@ -2165,56 +2191,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; handle->buffer = buffer; - } - handle->sync_read = 1; - if (handle->prev < handle->cur) { - if (handle->prev == 0) { - error = load_header(buffer); - if (error) - return error; + } else if (handle->cur == 1) { + error = load_header(buffer); + if (error) + return error; - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); - if (error) - return error; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; - } else if (handle->prev <= nr_meta_pages) { - error = unpack_orig_pfns(buffer, ©_bm); + } else if (handle->cur <= nr_meta_pages + 1) { + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + + if (handle->cur == nr_meta_pages + 1) { + error = prepare_image(&orig_bm, ©_bm); if (error) return error; - if (handle->prev == nr_meta_pages) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (IS_ERR(handle->buffer)) - return PTR_ERR(handle->buffer); - } - } else { - copy_last_highmem_page(); + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); + handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; } else { - handle->cur_offset += count; + copy_last_highmem_page(); + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + if (handle->buffer != buffer) + handle->sync_read = 0; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2229,7 +2242,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); free_highmem_data(); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e..7335952ee47 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -15,6 +15,13 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/gfp.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/suspend.h> #include "power.h" @@ -129,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if 
(suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -156,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -171,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); @@ -189,6 +197,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -199,6 +208,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -215,6 +225,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 09b2b0ae9e9..e6a5bdf61a3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,7 +4,7 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -23,11 +23,46 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> +#include <linux/slab.h> #include "power.h" #define SWSUSP_SIG "S1SUSPEND" +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we also only need to use one swap_map_page structure + * at a time. + */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; +}; + struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; @@ -113,7 +148,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. 
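A quick sanity check of the swap_map_page layout shown above: assuming 4 KiB pages and an 8-byte sector_t (both are configuration dependent, not stated by the patch), MAP_PAGE_ENTRIES works out to 4096 / 8 - 1 = 511, so each map page indexes 511 image pages, just under 2 MiB of image data, before chaining to the next map page through .next_swap:

/* Throwaway arithmetic for the swap map layout above; the page size and
 * sector_t width are assumptions, not values taken from the patch. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const size_t page_size = 4096;                          /* assumed PAGE_SIZE */
        const size_t entries = page_size / sizeof(uint64_t) - 1;        /* MAP_PAGE_ENTRIES */

        printf("entries per swap_map_page: %zu\n", entries);            /* 511 */
        printf("image data per map page:   %zu KiB\n",
               entries * page_size / 1024);                             /* 2044 KiB */
        return 0;
}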
- * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ @@ -144,110 +179,24 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, page_off, virt_to_page(addr), bio_chain); -} - -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} +struct block_device *hib_resume_bdev; /* * Saving part */ -static int mark_swapfiles(sector_t start, unsigned int flags) +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); - swsusp_header->image = start; + swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -259,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags) /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) + * + * This is called before saving image */ - -static int swsusp_swap_check(void) /* This is called before saving image */ +static int swsusp_swap_check(void) { int res; res 
= swap_type_of(swsusp_resume_device, swsusp_resume_block, - &resume_bdev); + &hib_resume_bdev); if (res < 0) return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE); if (res) return res; - res = set_blocksize(resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, FMODE_WRITE); return res; } @@ -308,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) } else { src = buf; } - return bio_write_page(offset, src, bio_chain); + return hib_bio_write_page(offset, src, bio_chain); } -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we also only need to use one swap_map_page structure - * at a time. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - sector_t cur_swap; - unsigned int k; -}; - static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) @@ -353,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle) static int get_swap_writer(struct swap_map_handle *handle) { + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + printk(KERN_ERR "PM: Cannot find swap device, try " + "swapon -a.\n"); + return ret; + } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; + } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; + ret = -ENOSPC; + goto err_rel; } handle->k = 0; + handle->first_sector = handle->cur_swap; return 0; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(FMODE_WRITE); + return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, @@ -379,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; offset = alloc_swapdev_block(root_swap); @@ -405,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle) return -EINVAL; } +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + flush_swap_writer(handle); + printk(KERN_INFO "PM: S"); + error = mark_swapfiles(handle, flags); + printk("|\n"); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(FMODE_WRITE); + + return error; +} + /** * save_image - save the suspend image data */ @@ -430,7 +382,7 @@ static int save_image(struct swap_map_handle *handle, bio = NULL; 
do_gettimeofday(&start); while (1) { - ret = snapshot_read_next(snapshot, PAGE_SIZE); + ret = snapshot_read_next(snapshot); if (ret <= 0) break; ret = swap_write_page(handle, data_of(*snapshot), &bio); @@ -440,7 +392,7 @@ static int save_image(struct swap_map_handle *handle, printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!ret) ret = err2; @@ -482,50 +434,34 @@ int swsusp_write(unsigned int flags) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + unsigned long pages; int error; - error = swsusp_swap_check(); + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } + if (!enough_swap(pages)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); + error = snapshot_read_next(&snapshot); if (error < PAGE_SIZE) { if (error >= 0) error = -EFAULT; - goto out; + goto out_finish; } header = (struct swsusp_info *)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out; - } - error = get_swap_writer(&handle); - if (!error) { - sector_t start = handle.cur_swap; - - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - - if (!error) { - flush_swap_writer(&handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(start, flags); - printk("|\n"); - } - } - if (error) - free_all_swap_pages(root_swap); - - release_swap_writer(&handle); - out: - swsusp_close(FMODE_WRITE); + error = swap_write_page(&handle, header, NULL); + if (!error) + error = save_image(&handle, &snapshot, pages - 1); +out_finish: + error = swap_writer_finish(&handle, flags, error); return error; } @@ -541,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle) handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, sector_t start) +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) { int error; - if (!start) + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? 
*/ return -EINVAL; handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); if (!handle->cur) return -ENOMEM; - error = bio_read_page(start, handle->cur, NULL); + error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -572,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); + error = hib_bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; offset = handle->cur->next_swap; if (!offset) release_swap_reader(handle); else if (!error) - error = bio_read_page(offset, handle->cur, NULL); + error = hib_bio_read_page(offset, handle->cur, NULL); } return error; } +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot @@ -614,21 +560,21 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); + error = snapshot_write_next(snapshot); if (error <= 0) break; error = swap_read_page(handle, data_of(*snapshot), &bio); if (error) break; if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); + error = hib_wait_on_bio_chain(&bio); if (error) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!error) error = err2; @@ -656,24 +602,20 @@ int swsusp_read(unsigned int *flags_p) struct snapshot_handle snapshot; struct swsusp_info *header; - *flags_p = swsusp_header->flags; - if (IS_ERR(resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return PTR_ERR(resume_bdev); - } - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); + error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? 
error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header->image); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; if (!error) error = swap_read_page(&handle, header, NULL); if (!error) error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - + swap_reader_finish(&handle); +end: if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -689,11 +631,11 @@ int swsusp_check(void) { int error; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); + hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); memset(swsusp_header, 0, PAGE_SIZE); - error = bio_read_page(swsusp_resume_block, + error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -701,7 +643,7 @@ int swsusp_check(void) if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -709,11 +651,11 @@ int swsusp_check(void) put: if (error) - blkdev_put(resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ); else pr_debug("PM: Signature found, resuming\n"); } else { - error = PTR_ERR(resume_bdev); + error = PTR_ERR(hib_resume_bdev); } if (error) @@ -728,12 +670,12 @@ put: void swsusp_close(fmode_t mode) { - if (IS_ERR(resume_bdev)) { + if (IS_ERR(hib_resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev, mode); + blkdev_put(hib_resume_bdev, mode); } static int swsusp_header_init(void) diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 5b3601bd189..00000000000 --- a/kernel/power/swsusp.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/kernel/power/swsusp.c - * - * This file provides code to write suspend image to swap and read it back. - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. - * - * I'd like to thank the following people for their work: - * - * Pavel Machek <pavel@ucw.cz>: - * Modifications, defectiveness pointing, being with me at the very beginning, - * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. - * - * Steve Doddi <dirk@loth.demon.co.uk>: - * Support the possibility of hardware state restoring. - * - * Raph <grey.havens@earthling.net>: - * Support for preserving states of network devices and virtual console - * (including X and svgatextmode) - * - * Kurt Garloff <garloff@suse.de>: - * Straightened the critical function in order to prevent compilers from - * playing tricks with local variables. - * - * Andreas Mohr <a.mohr@mailto.de> - * - * Alex Badea <vampire@go.ro>: - * Fixed runaway init - * - * Rafael J. Wysocki <rjw@sisk.pl> - * Reworked the freeing of memory and the handling of swap - * - * More state savers are welcome. Especially for the scsi layer... 
- * - * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt - */ - -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/swap.h> -#include <linux/pm.h> -#include <linux/swapops.h> -#include <linux/bootmem.h> -#include <linux/syscalls.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/rbtree.h> -#include <linux/io.h> - -#include "power.h" - -int in_suspend __nosavedata = 0; diff --git a/kernel/power/user.c b/kernel/power/user.c index bf0014d6a5f..e819e17877c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); @@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, res = -ENODATA; goto Unlock; } - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; + if (!pg_offp) { /* on page boundary? */ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + Unlock: mutex_unlock(&pm_mutex); @@ -178,23 +184,39 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); data = filp->private_data; - res = snapshot_write_next(&data->handle, count); - if (res > 0) { - if (copy_from_user(data_of(data->handle), buf, res)) - res = -EFAULT; - else - *offp = data->handle.offset; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: mutex_unlock(&pm_mutex); return res; } +static void snapshot_deprecated_ioctl(unsigned int cmd) +{ + if (printk_ratelimit()) + printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " + "be removed soon, update your suspend-to-disk " + "utilities\n", + __builtin_return_address(0), cmd); +} + static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -246,8 +268,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_CREATE_IMAGE: case SNAPSHOT_ATOMIC_SNAPSHOT: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; @@ -275,8 +298,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_PREF_IMAGE_SIZE: case SNAPSHOT_SET_IMAGE_SIZE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -290,15 +314,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP_SIZE: case SNAPSHOT_AVAIL_SWAP: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_GET_SWAP_PAGE: + snapshot_deprecated_ioctl(cmd); + 
case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; @@ -321,6 +347,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); if (!swsusp_swap_in_use()) { /* * User space encodes device types as two-byte values, @@ -362,6 +389,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); error = -EINVAL; switch (arg) { @@ -405,7 +433,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * User space encodes device types as two-byte values, * so we need to recode them */ - swdev = old_decode_dev(swap_area.dev); + swdev = new_decode_dev(swap_area.dev); if (swdev) { offset = swap_area.offset; data->swap = swap_type_of(swdev, offset, NULL); diff --git a/kernel/printk.c b/kernel/printk.c index 1751c456b71..8fe465ac008 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,8 +33,12 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/kexec.h> +#include <linux/kdb.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> +#include <linux/syslog.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/uaccess.h> @@ -69,8 +73,6 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -static int saved_console_loglevel = -1; - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -145,6 +147,7 @@ static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +static int saved_console_loglevel = -1; #ifdef CONFIG_KEXEC /* @@ -258,38 +261,23 @@ static inline void boot_delay_msec(void) } #endif -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. 
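The snapshot_read()/snapshot_write() rework above hands at most one page per call to the generic buffer-copy helpers. A minimal sketch of the contract being relied on, with kbuf standing in for data_of(data->handle):

static ssize_t page_chunk_read_sketch(char __user *buf, size_t count,
				      loff_t *offp, void *kbuf)
{
	loff_t pg_offp = *offp & ~PAGE_MASK;	/* position within the current page */
	ssize_t res;

	/* Copies min(count, PAGE_SIZE - pg_offp) bytes and advances pg_offp. */
	res = simple_read_from_buffer(buf, count, &pg_offp, kbuf, PAGE_SIZE);
	if (res > 0)
		*offp += res;			/* only then advance the real offset */
	return res;
}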
- * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ -int do_syslog(int type, char __user *buf, int len) +int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; - error = security_syslog(type); + error = security_syslog(type, from_file); if (error) return error; switch (type) { - case 0: /* Close log */ + case SYSLOG_ACTION_CLOSE: /* Close log */ break; - case 1: /* Open log */ + case SYSLOG_ACTION_OPEN: /* Open log */ break; - case 2: /* Read from log */ + case SYSLOG_ACTION_READ: /* Read from log */ error = -EINVAL; if (!buf || len < 0) goto out; @@ -320,10 +308,12 @@ int do_syslog(int type, char __user *buf, int len) if (!error) error = i; break; - case 4: /* Read/clear last kernel messages */ + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: do_clear = 1; /* FALL THRU */ - case 3: /* Read last kernel messages */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: error = -EINVAL; if (!buf || len < 0) goto out; @@ -376,21 +366,25 @@ int do_syslog(int type, char __user *buf, int len) } } break; - case 5: /* Clear ring buffer */ + /* Clear ring buffer */ + case SYSLOG_ACTION_CLEAR: logged_chars = 0; break; - case 6: /* Disable logging to console */ + /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; - case 7: /* Enable logging to console */ + /* Enable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != -1) { console_loglevel = saved_console_loglevel; saved_console_loglevel = -1; } break; - case 8: /* Set level of messages printed to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: error = -EINVAL; if (len < 1 || len > 8) goto out; @@ -401,10 +395,12 @@ int do_syslog(int type, char __user *buf, int len) saved_console_loglevel = -1; error = 0; break; - case 9: /* Number of chars in the log buffer */ + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: error = log_end - log_start; break; - case 10: /* Size of the log buffer */ + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: @@ -417,9 +413,25 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len); + return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } +#ifdef CONFIG_KGDB_KDB +/* kdb dmesg command needs access to the syslog buffer. do_syslog() + * uses locks so it cannot be used during debugging. Just tell kdb + * where the start and end of the physical and logical logs are. This + * is equivalent to do_syslog(3). + */ +void kdb_syslog_data(char *syslog_data[4]) +{ + syslog_data[0] = log_buf; + syslog_data[1] = log_buf + log_buf_len; + syslog_data[2] = log_buf + log_end - + (logged_chars < log_buf_len ? logged_chars : log_buf_len); + syslog_data[3] = log_buf + log_end; +} +#endif /* CONFIG_KGDB_KDB */ + /* * Call the console drivers on a range of log_buf */ @@ -593,6 +605,14 @@ asmlinkage int printk(const char *fmt, ...) 
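The magic numbers in do_syslog() are replaced by SYSLOG_ACTION_* constants from <linux/syslog.h>; the numeric values seen by user space are unchanged. As a small illustrative user-space check (not part of the patch), SYSLOG_ACTION_SIZE_BUFFER can be exercised through glibc's klogctl():

#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	/* 10 == SYSLOG_ACTION_SIZE_BUFFER: ask for the size of log_buf. */
	int len = klogctl(10, NULL, 0);

	printf("kernel log buffer: %d bytes\n", len);
	return len < 0;
}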
va_list args; int r; +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } +#endif va_start(args, fmt); r = vprintk(fmt, args); va_end(args); @@ -967,6 +987,32 @@ void resume_console(void) } /** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused + * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. + */ +static int __cpuinit console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DYING: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + acquire_console_sem(); + release_console_sem(); + } + return NOTIFY_OK; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -1353,7 +1399,7 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); -static int __init disable_boot_consoles(void) +static int __init printk_late_init(void) { struct console *con; @@ -1364,9 +1410,10 @@ static int __init disable_boot_consoles(void) unregister_console(con); } } + hotcpu_notifier(console_cpu_notify, 0); return 0; } -late_initcall(disable_boot_consoles); +late_initcall(printk_late_init); #if defined CONFIG_PRINTK @@ -1502,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) chars = logged_chars; spin_unlock_irqrestore(&logbuf_lock, flags); - if (logged_chars > end) { - s1 = log_buf + log_buf_len - logged_chars + end; - l1 = logged_chars - end; + if (chars > end) { + s1 = log_buf + log_buf_len - chars + end; + l1 = chars - end; s2 = log_buf; l2 = end; @@ -1512,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason) s1 = ""; l1 = 0; - s2 = log_buf + end - logged_chars; - l2 = logged_chars; + s2 = log_buf + end - chars; + l2 = chars; } if (!spin_trylock_irqsave(&dump_list_lock, flags)) { diff --git a/kernel/profile.c b/kernel/profile.c index a55d3a367ae..b22a899934c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -127,8 +127,10 @@ int __ref profile_init(void) return 0; prof_buffer = vmalloc(buffer_bytes); - if (prof_buffer) + if (prof_buffer) { + memset(prof_buffer, 0, buffer_bytes); return 0; + } free_cpumask_var(prof_cpu_mask); return -ENOMEM; @@ -363,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - node = cpu_to_node(cpu); + node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { @@ -386,7 +388,7 @@ out_free: page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); per_cpu(cpu_profile_hits, cpu)[1] = NULL; __free_page(page); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); case CPU_ONLINE: case CPU_ONLINE_FROZEN: if (prof_cpu_mask != NULL) @@ -565,7 +567,7 @@ static int create_hash_tables(void) int cpu; for_each_online_cpu(cpu) { - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); struct page 
*page; page = alloc_pages_exact_node(node, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 23bd09cd042..f34d798ef4a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -14,7 +14,6 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> @@ -22,6 +21,7 @@ #include <linux/pid_namespace.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <linux/regset.h> /* @@ -75,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } @@ -325,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data) } /* - * Detach all tasks we were using ptrace on. + * Detach all tasks we were using ptrace on. Called with tasklist held + * for writing, and returns with it held too. But note it can release + * and reacquire the lock. */ void exit_ptrace(struct task_struct *tracer) { struct task_struct *p, *n; LIST_HEAD(ptrace_dead); - write_lock_irq(&tasklist_lock); + if (likely(list_empty(&tracer->ptraced))) + return; + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } - write_unlock_irq(&tasklist_lock); + write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&tracer->ptraced)); list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } + + write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) @@ -511,6 +516,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data) return 0; } +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + +static const struct user_regset * +find_regset(const struct user_regset_view *view, unsigned int type) +{ + const struct user_regset *regset; + int n; + + for (n = 0; n < view->n; ++n) { + regset = view->regsets + n; + if (regset->core_note_type == type) + return regset; + } + + return NULL; +} + +static int ptrace_regset(struct task_struct *task, int req, unsigned int type, + struct iovec *kiov) +{ + const struct user_regset_view *view = task_user_regset_view(task); + const struct user_regset *regset = find_regset(view, type); + int regset_no; + + if (!regset || (kiov->iov_len % regset->size) != 0) + return -EINVAL; + + regset_no = regset - view->regsets; + kiov->iov_len = min(kiov->iov_len, + (__kernel_size_t) (regset->n * regset->size)); + + if (req == PTRACE_GETREGSET) + return copy_regset_to_user(task, view, regset_no, 0, + kiov->iov_len, kiov->iov_base); + else + return copy_regset_from_user(task, view, regset_no, 0, + kiov->iov_len, kiov->iov_base); +} + +#endif + int ptrace_request(struct task_struct *child, long request, long addr, long data) { @@ -554,6 +600,32 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_detach(child, data); break; +#ifdef CONFIG_BINFMT_ELF_FDPIC + case PTRACE_GETFDPIC: { + struct mm_struct *mm = get_task_mm(child); + unsigned long tmp = 0; + + ret = -ESRCH; + if (!mm) + break; + + switch (addr) { + case PTRACE_GETFDPIC_EXEC: + tmp = mm->context.exec_fdpic_loadmap; + break; + case PTRACE_GETFDPIC_INTERP: + tmp = mm->context.interp_fdpic_loadmap; + break; + default: + break; + } + mmput(mm); + + ret = put_user(tmp, (unsigned long __user *) data); + break; + } +#endif + #ifdef PTRACE_SINGLESTEP 
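For the new PTRACE_GETREGSET/PTRACE_SETREGSET requests above, addr selects a regset by its ELF note type and data points to a struct iovec describing the user buffer; on return iov_len is trimmed to the number of bytes actually transferred. An illustrative user-space caller follows; NT_PRSTATUS and user_regs_struct are example choices, not mandated by the patch:

#include <stdio.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <elf.h>

#ifndef PTRACE_GETREGSET
#define PTRACE_GETREGSET 0x4204		/* from linux/ptrace.h */
#endif

static int dump_gp_regs(pid_t pid)
{
	struct user_regs_struct regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov) == -1)
		return -1;
	printf("regset bytes returned: %zu\n", iov.iov_len);
	return 0;
}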
case PTRACE_SINGLESTEP: #endif @@ -573,6 +645,26 @@ int ptrace_request(struct task_struct *child, long request, return 0; return ptrace_resume(child, request, SIGKILL); +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + case PTRACE_GETREGSET: + case PTRACE_SETREGSET: + { + struct iovec kiov; + struct iovec __user *uiov = (struct iovec __user *) data; + + if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + return -EFAULT; + + if (__get_user(kiov.iov_base, &uiov->iov_base) || + __get_user(kiov.iov_len, &uiov->iov_len)) + return -EFAULT; + + ret = ptrace_regset(child, request, addr, &kiov); + if (!ret) + ret = __put_user(kiov.iov_len, &uiov->iov_len); + break; + } +#endif default: break; } @@ -604,10 +696,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) struct task_struct *child; long ret; - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); if (!ret) @@ -641,7 +729,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) out_put_task_struct: put_task_struct(child); out: - unlock_kernel(); return ret; } @@ -711,6 +798,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, else ret = ptrace_setsiginfo(child, &siginfo); break; +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + case PTRACE_GETREGSET: + case PTRACE_SETREGSET: + { + struct iovec kiov; + struct compat_iovec __user *uiov = + (struct compat_iovec __user *) datap; + compat_uptr_t ptr; + compat_size_t len; + + if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + return -EFAULT; + + if (__get_user(ptr, &uiov->iov_base) || + __get_user(len, &uiov->iov_len)) + return -EFAULT; + + kiov.iov_base = compat_ptr(ptr); + kiov.iov_len = len; + + ret = ptrace_regset(child, request, addr, &kiov); + if (!ret) + ret = __put_user(kiov.iov_len, &uiov->iov_len); + break; + } +#endif default: ret = ptrace_request(child, request, addr, data); @@ -725,10 +838,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, struct task_struct *child; long ret; - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); goto out; @@ -758,7 +867,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, out_put_task_struct: put_task_struct(child); out: - unlock_kernel(); return ret; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 00000000000..471b66acabb --- /dev/null +++ b/kernel/range.c @@ -0,0 +1,159 @@ +/* + * Range add and subtract + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sort.h> + +#include <linux/range.h> + +int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) +{ + if (start >= end) + return nr_range; + + /* Out of slots: */ + if (nr_range >= az) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +int add_range_with_merge(struct range *range, int az, int nr_range, + u64 start, u64 end) +{ + int i; + + if (start >= end) + return nr_range; + + /* Try to merge it with old one: */ + for (i = 0; i < nr_range; i++) { + u64 final_start, final_end; + u64 common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + 
+ range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + + /* Need to add it: */ + return add_range(range, az, nr_range, start, end); +} + +void subtract_range(struct range *range, int az, u64 start, u64 end) +{ + int i, j; + + if (start >= end) + return; + + for (j = 0; j < az; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && + range[j].start < end) { + range[j].start = end; + continue; + } + + + if (start > range[j].start && end >= range[j].end && + range[j].end > start) { + range[j].end = start; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* Find the new spare: */ + for (i = 0; i < az; i++) { + if (range[i].end == 0) + break; + } + if (i < az) { + range[i].end = range[j].end; + range[i].start = end; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start; + continue; + } + } +} + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + s64 start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +int clean_sort_range(struct range *range, int az) +{ + int i, j, k = az - 1, nr_range = 0; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); + + return nr_range; +} + +void sort_range(struct range *range, int nr_range) +{ + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); +} diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 9b7fd472387..4d169835fb3 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,14 +44,54 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> +#include <linux/hardirq.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); EXPORT_SYMBOL_GPL(rcu_lock_map); + +static struct lock_class_key rcu_bh_lock_key; +struct lockdep_map rcu_bh_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +EXPORT_SYMBOL_GPL(rcu_bh_lock_map); + +static struct lock_class_key rcu_sched_lock_key; +struct lockdep_map rcu_sched_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +EXPORT_SYMBOL_GPL(rcu_sched_lock_map); #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int debug_lockdep_rcu_enabled(void) +{ + return rcu_scheduler_active && debug_locks && + current->lockdep_recursion == 0; +} +EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); + +/** + * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 
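The new kernel/range.c helpers above maintain a fixed-size array of ranges in which end == 0 marks a free slot. A hypothetical in-kernel caller might use them as follows; the array size and values are made up purely for illustration:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/range.h>

#define DEMO_SLOTS 8
static struct range demo_ranges[DEMO_SLOTS];

static int __init range_demo_init(void)
{
	int nr = 0;

	nr = add_range_with_merge(demo_ranges, DEMO_SLOTS, nr, 0x1000, 0x4000);
	nr = add_range_with_merge(demo_ranges, DEMO_SLOTS, nr, 0x3000, 0x8000);
	/* The two overlapping ranges above have been merged into one entry. */
	subtract_range(demo_ranges, DEMO_SLOTS, 0x5000, 0x6000);
	/* Punching a hole splits the entry and consumes a spare slot. */
	nr = clean_sort_range(demo_ranges, DEMO_SLOTS);
	pr_info("range_demo: %d ranges left\n", nr);
	return 0;
}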
+ */ +int rcu_read_lock_bh_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + return in_softirq(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. @@ -63,3 +103,174 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. + */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. 
+ */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9f6d9ff2572..196ec02f8be 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,9 +44,9 @@ struct rcu_ctrlblk { }; /* Definition for rcupdate control block. 
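The debugobjects hooks above exist so that rcu_head structures living on the stack can be tracked explicitly. The pattern, which the rcu_barrier*() changes later in this patch also follow, looks like this:

static void wait_for_rcu_sketch(void)
{
	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);	/* announce the on-stack rcu_head */
	init_completion(&rcu.completion);
	call_rcu(&rcu.head, wakeme_after_rcu);	/* completes after a grace period */
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);	/* it is about to go out of scope */
}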
*/ -static struct rcu_ctrlblk rcu_ctrlblk = { - .donetail = &rcu_ctrlblk.rcucblist, - .curtail = &rcu_ctrlblk.rcucblist, +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { @@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) */ void rcu_sched_qs(int cpu) { - if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) + if (rcu_qsctr_help(&rcu_sched_ctrlblk) + + rcu_qsctr_help(&rcu_bh_ctrlblk)) raise_softirq(RCU_SOFTIRQ); } @@ -163,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -173,7 +180,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_process_callbacks(struct softirq_action *unused) { - __rcu_process_callbacks(&rcu_ctrlblk); + __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); } @@ -187,7 +194,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) * * Cool, huh? (Due to Josh Triplett.) * - * But we want to make this a static inline later. + * But we want to make this a static inline later. The cond_resched() + * currently makes this problematic. */ void synchronize_sched(void) { @@ -195,12 +203,6 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); -void synchronize_rcu_bh(void) -{ - synchronize_sched(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - /* * Helper function for call_rcu() and call_rcu_bh(). */ @@ -210,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; @@ -226,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_ctrlblk); + __call_rcu(head, func, &rcu_sched_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu); @@ -244,11 +247,13 @@ void rcu_barrier(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -256,11 +261,13 @@ void rcu_barrier_bh(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_bh(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -268,11 +275,13 @@ void rcu_barrier_sched(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); /* Wait for it. 
*/ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); @@ -280,3 +289,5 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } + +#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h new file mode 100644 index 00000000000..d223a92bc74 --- /dev/null +++ b/kernel/rcutiny_plugin.h @@ -0,0 +1,39 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptable semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2009 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +#include <linux/kernel_stat.h> + +/* + * During boot, we forgive RCU lockdep issues. After this function is + * invoked, we start taking RCU lockdep issues seriously. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af0..2e2726d790b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ +static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_stutter = 3; /* Wait time between bursts (s). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. 
*/ module_param(nreaders, int, 0444); @@ -79,6 +82,12 @@ module_param(stutter, int, 0444); MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); module_param(irqreader, int, 0444); MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; +static struct task_struct *fqs_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -229,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; @@ -263,6 +272,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); + void (*fqs)(void); int (*stats)(char *page); int irq_capable; char *name; @@ -347,6 +357,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu" @@ -388,6 +399,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_sync" @@ -403,6 +415,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_expedited" @@ -450,9 +463,11 @@ static void rcu_bh_torture_synchronize(void) { struct rcu_bh_torture_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } static struct rcu_torture_ops rcu_bh_ops = { @@ -465,6 +480,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .deferred_free = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh" @@ -480,6 +496,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh_sync" @@ -621,6 +638,7 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "sched" @@ -636,6 +654,7 @@ static struct rcu_torture_ops sched_sync_ops = { .deferred_free = 
rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .name = "sched_sync" }; @@ -650,12 +669,45 @@ static struct rcu_torture_ops sched_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, - .stats = rcu_expedited_torture_stats, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, .irq_capable = 1, .name = "sched_expedited" }; /* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. + */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (jiffies - fqs_resume_time > LONG_MAX) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). @@ -745,7 +797,11 @@ static void rcu_torture_timer(unsigned long unused) idx = cur_ops->readlock(); completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -763,13 +819,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -798,11 +854,15 @@ rcu_torture_reader(void *arg) do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) - mod_timer(&t, 1); + mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -818,13 +878,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... 
*/ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); @@ -1030,10 +1090,11 @@ rcu_torture_print_module_parms(char *tag) printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d\n", + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); } static struct notifier_block rcutorture_nb = { @@ -1109,6 +1170,12 @@ rcu_torture_cleanup(void) } stats_task = NULL; + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + /* Wait for all RCU callbacks to fire. */ if (cur_ops->cb_barrier != NULL) @@ -1154,6 +1221,11 @@ rcu_torture_init(void) mutex_unlock(&fullstop_mutex); return -EINVAL; } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " + "fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1282,6 +1354,19 @@ rcu_torture_init(void) goto unwind; } } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } register_reboot_notifier(&rcutorture_nb); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53ae9598f79..d5bc43976c5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -54,8 +54,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; -#define RCU_STATE_INITIALIZER(name) { \ - .level = { &name.node[0] }, \ +#define RCU_STATE_INITIALIZER(structname) { \ + .level = { &structname.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. 
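The torture readers above switch from rcu_dereference() to rcu_dereference_check(), whose second argument documents every locking context in which the access is legitimate; under CONFIG_PROVE_RCU, lockdep complains only when none of them hold. Condensed from the reader path above:

static struct rcu_torture *current_torture_element(void)
{
	return rcu_dereference_check(rcu_torture_current,
				     rcu_read_lock_held() ||
				     rcu_read_lock_bh_held() ||
				     rcu_read_lock_sched_held() ||
				     srcu_read_lock_held(&srcu_ctl));
}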
*/ \ NUM_RCU_LVL_1, \ @@ -66,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ .orphan_cbs_list = NULL, \ - .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_cbs_tail = &structname.orphan_cbs_list, \ .orphan_qlen = 0, \ - .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ + .name = #structname, \ } struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); @@ -81,8 +82,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -static int rcu_scheduler_active __read_mostly; - +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -101,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(int cpu) { - struct rcu_data *rdp; + struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp = &per_cpu(rcu_sched_data, cpu); rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; - rcu_preempt_note_context_switch(cpu); } void rcu_bh_qs(int cpu) { - struct rcu_data *rdp; + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp = &per_cpu(rcu_bh_data, cpu); rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } +/* + * Note a context switch. This is a quiescent state for RCU-sched, + * and requires special handling for preemptible RCU. + */ +void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); +} + #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, @@ -157,6 +165,24 @@ long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Force a quiescent state for RCU BH. + */ +void rcu_bh_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_bh_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); + +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + +/* * Does the CPU have callbacks ready to be invoked? */ static int @@ -424,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_panicking __read_mostly; + static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -439,10 +467,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; @@ -452,23 +480,30 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * due to CPU offlining. 
*/ rcu_print_task_stall(rnp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + rsp->name); rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); if (rnp->qsmask == 0) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) printk(" %d", rnp->grplo + cpu); } - printk(" (detected by %d, t=%ld jiffies)\n", + printk("} (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); trigger_all_cpu_backtrace(); + /* If so configured, complain about tasks blocking the grace period. */ + + rcu_print_detail_task_stall(rsp); + force_quiescent_state(rsp, 0); /* Kick them all. */ } @@ -477,15 +512,15 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", - smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", + rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); - spin_lock_irqsave(&rnp->lock, flags); - if ((long)(jiffies - rsp->jiffies_stall) >= 0) + raw_spin_lock_irqsave(&rnp->lock, flags); + if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ } @@ -495,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; + if (rcu_cpu_stall_panicking) + return; delta = jiffies - rsp->jiffies_stall; rnp = rdp->mynode; if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { @@ -509,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_panicking = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static void __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +} + #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -519,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +static void __init check_cpu_stall_init(void) +{ +} + #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -545,12 +601,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } __note_new_gpnum(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -609,12 +665,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. 
*/ - !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } __rcu_process_gp_end(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -659,12 +715,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); - if (!cpu_needs_another_gp(rsp, rdp)) { + if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { + if (cpu_needs_another_gp(rsp, rdp)) + rsp->fqs_need_gp = 1; if (rnp->completed == rsp->completed) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* * Propagate new ->completed value to rcu_node structures @@ -672,9 +730,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * of the next grace period to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->completed = rsp->completed; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } local_irq_restore(flags); return; @@ -695,15 +753,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ /* Exclude any concurrent CPU-hotplug operations. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -723,21 +781,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } rnp = rcu_get_root(rsp); - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - spin_unlock(&rnp->lock); /* irqs remain disabled. */ - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -776,14 +834,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (!(rnp->qsmask & mask)) { /* Our bit has already been cleared, so done. */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rnp->qsmask &= ~mask; if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { /* Other bits still set at this level, so done. 
*/ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } mask = rnp->grpmask; @@ -793,10 +851,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); WARN_ON_ONCE(rnp_c->qsmask); } @@ -825,7 +883,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las struct rcu_node *rnp; rnp = rdp->mynode; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); if (lastcomp != rnp->completed) { /* @@ -837,12 +895,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las * race occurred. */ rdp->passed_quiesc = 0; /* try again later! */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rdp->qs_pending = 0; @@ -906,7 +964,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ *rsp->orphan_cbs_tail = rdp->nxtlist; rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtlist = NULL; @@ -914,7 +972,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; rdp->qlen = 0; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } /* @@ -925,10 +983,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) unsigned long flags; struct rcu_data *rdp; - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); rdp = rsp->rda[smp_processor_id()]; if (rsp->orphan_cbs_list == NULL) { - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; } *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; @@ -937,7 +995,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -953,23 +1011,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } if (rnp == rdp->mynode) need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); else - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ mask = rnp->grpmask; rnp = rnp->parent; } while (rnp != NULL); @@ -980,12 +1038,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) * because invoking rcu_report_unblock_qs_rnp() with ->onofflock * held leads to deadlock. */ - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ rnp = rdp->mynode; if (need_report & RCU_OFL_TASKS_NORM_GP) rcu_report_unblock_qs_rnp(rnp, flags); else - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); @@ -1054,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1103,8 +1162,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - if (!rcu_pending(cpu)) - return; /* if nothing for RCU to do. */ if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1136,7 +1193,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(cpu); } rcu_preempt_check_callbacks(cpu); - raise_softirq(RCU_SOFTIRQ); + if (rcu_pending(cpu)) + raise_softirq(RCU_SOFTIRQ); } #ifdef CONFIG_SMP @@ -1144,11 +1202,9 @@ void rcu_check_callbacks(int cpu, int user) /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. - * Returns 1 if the current grace period ends while scanning (possibly - * because we made it end). + * The caller must have suppressed start of new grace periods. */ -static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, - int (*f)(struct rcu_data *)) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) { unsigned long bit; int cpu; @@ -1158,13 +1214,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp->lock, flags); - if (rnp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp->lock, flags); - return 1; + raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_gp_in_progress(rsp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; } if (rnp->qsmask == 0) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; } cpu = rnp->grplo; @@ -1173,15 +1229,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } - if (mask != 0 && rnp->completed == lastcomp) { + if (mask != 0) { /* rcu_report_qs_rnp() releases rnp->lock. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } - return 0; } /* @@ -1191,78 +1246,65 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, static void force_quiescent_state(struct rcu_state *rsp, int relaxed) { unsigned long flags; - long lastcomp; struct rcu_node *rnp = rcu_get_root(rsp); - u8 signaled; - u8 forcenow; if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ - if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { + if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! 
*/ return; /* Someone else is already on the job. */ } - if (relaxed && - (long)(rsp->jiffies_force_qs - jiffies) >= 0) - goto unlock_ret; /* no emergency and done recently. */ + if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) + goto unlock_fqs_ret; /* no emergency and done recently. */ rsp->n_force_qs++; - spin_lock(&rnp->lock); - lastcomp = rsp->gpnum - 1; - signaled = rsp->signaled; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; if(!rcu_gp_in_progress(rsp)) { rsp->n_force_qs_ngp++; - spin_unlock(&rnp->lock); - goto unlock_ret; /* no GP in progress, time updated. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + goto unlock_fqs_ret; /* no GP in progress, time updated. */ } - spin_unlock(&rnp->lock); - switch (signaled) { + rsp->fqs_active = 1; + switch (rsp->signaled) { case RCU_GP_IDLE: case RCU_GP_INIT: break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) break; /* So gcc recognizes the dead code. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + /* Record dyntick-idle state. */ - if (rcu_process_dyntick(rsp, lastcomp, - dyntick_save_progress_counter)) - goto unlock_ret; - /* fall into next case. */ - - case RCU_SAVE_COMPLETED: - - /* Update state, record completion counter. */ - forcenow = 0; - spin_lock(&rnp->lock); - if (lastcomp + 1 == rsp->gpnum && - lastcomp == rsp->completed && - rsp->signaled == signaled) { + force_qs_rnp(rsp, dyntick_save_progress_counter); + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + if (rcu_gp_in_progress(rsp)) rsp->signaled = RCU_FORCE_QS; - rsp->completed_fqs = lastcomp; - forcenow = signaled == RCU_SAVE_COMPLETED; - } - spin_unlock(&rnp->lock); - if (!forcenow) - break; - /* fall into next case. */ + break; case RCU_FORCE_QS: /* Check dyntick-idle state, send IPI to laggarts. */ - if (rcu_process_dyntick(rsp, rsp->completed_fqs, - rcu_implicit_dynticks_qs)) - goto unlock_ret; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); /* Leave state in case more forcing is required. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ break; } -unlock_ret: - spin_unlock_irqrestore(&rsp->fqslock, flags); + rsp->fqs_active = 0; + if (rsp->fqs_need_gp) { + raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ + rsp->fqs_need_gp = 0; + rcu_start_gp(rsp, flags); /* releases rnp->lock */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ +unlock_fqs_ret: + raw_spin_unlock_irqrestore(&rsp->fqslock, flags); } #else /* #ifdef CONFIG_SMP */ @@ -1290,7 +1332,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. */ - if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) + if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); /* @@ -1304,7 +1346,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* Does this CPU require a not-yet-started grace period? */ if (cpu_needs_another_gp(rsp, rdp)) { - spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); rcu_start_gp(rsp, flags); /* releases above lock */ } @@ -1335,6 +1377,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) * grace-period manipulations above. 
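
Several hunks here replace the old signed-difference jiffies tests with ULONG_CMP_GE()/ULONG_CMP_LT(), which the rcutree.h hunk below defines as unsigned-difference comparisons that survive counter wraparound. A small stand-alone C demo of why that works; the program and values are illustrative only:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long deadline = ULONG_MAX - 1UL; /* counter about to wrap */
        unsigned long now = deadline + 10UL;      /* wrapped past deadline */

        /* The unsigned difference stays small on the "recent" side of the
         * counter even across the wrap, so the ordering remains correct. */
        printf("deadline already passed: %d\n", ULONG_CMP_LT(deadline, now));
        printf("deadline still ahead:    %d\n", ULONG_CMP_GE(deadline, now));
        return 0;
}
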
*/ smp_mb(); /* See above block comment. */ + + /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ + rcu_needs_cpu_flush(); } static void @@ -1344,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; @@ -1369,7 +1415,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - spin_lock_irqsave(&rnp_root->lock, nestflag); + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } @@ -1387,7 +1433,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), force_quiescent_state(rsp, 0); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; - } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); local_irq_restore(flags); } @@ -1440,11 +1486,13 @@ void synchronize_sched(void) if (rcu_blocking_is_gp()) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -1464,11 +1512,13 @@ void synchronize_rcu_bh(void) if (rcu_blocking_is_gp()) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_bh(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -1489,8 +1539,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) { + if (rdp->qs_pending && !rdp->passed_quiesc) { + + /* + * If force_quiescent_state() coming soon and this CPU + * needs a quiescent state, and this is either RCU-sched + * or RCU-bh, force a local reschedule. + */ rdp->n_rp_qs_pending++; + if (!rdp->preemptable && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, + jiffies)) + set_need_resched(); + } else if (rdp->qs_pending && rdp->passed_quiesc) { + rdp->n_rp_report_qs++; return 1; } @@ -1520,7 +1582,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (rcu_gp_in_progress(rsp) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { rdp->n_rp_need_fqs++; return 1; } @@ -1545,10 +1607,9 @@ static int rcu_pending(int cpu) /* * Check to see if any future RCU-related work will need to be done * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * 1 if so. */ -int rcu_needs_cpu(int cpu) +static int rcu_needs_cpu_quick_check(int cpu) { /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || @@ -1556,21 +1617,6 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } -/* - * This function is invoked towards the end of the scheduler's initialization - * process. 
Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. - */ -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -1659,7 +1705,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) @@ -1669,7 +1715,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); #endif /* #ifdef CONFIG_NO_HZ */ rdp->cpu = cpu; - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1687,7 +1733,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ @@ -1695,7 +1741,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* * A new grace period might start here. If so, we won't be part @@ -1703,14 +1749,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) */ /* Exclude any attempts to start a new GP on large systems. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; do { /* Exclude any attempts to start a new GP on small systems. */ - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit |= mask; mask = rnp->grpmask; if (rnp == rdp->mynode) { @@ -1718,11 +1764,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->completed = rnp->completed; rdp->passed_quiesc_completed = rnp->completed - 1; } - spin_unlock(&rnp->lock); /* irqs already disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -1774,6 +1820,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, } /* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). 
After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. This function also enables RCU lockdep checking. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +/* * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. */ @@ -1806,11 +1867,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) */ static void __init rcu_init_one(struct rcu_state *rsp) { + static char *buf[] = { "rcu_node_level_0", + "rcu_node_level_1", + "rcu_node_level_2", + "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; struct rcu_node *rnp; + BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + /* Initialize the level-tracking arrays. */ for (i = 1; i < NUM_RCU_LVLS; i++) @@ -1823,8 +1890,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); - lockdep_set_class(&rnp->lock, &rcu_node_class[i]); + raw_spin_lock_init(&rnp->lock); + lockdep_set_class_and_name(&rnp->lock, + &rcu_node_class[i], buf[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1849,6 +1917,14 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[3]); } } + + rnp = rsp->level[NUM_RCU_LVLS - 1]; + for_each_possible_cpu(i) { + while (i > rnp->grphi) + rnp++; + rsp->rda[i]->mynode = rnp; + rcu_boot_init_percpu_data(i, rsp); + } } /* @@ -1859,32 +1935,18 @@ static void __init rcu_init_one(struct rcu_state *rsp) #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ int i; \ - int j; \ - struct rcu_node *rnp; \ \ - rcu_init_one(rsp); \ - rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ - j = 0; \ for_each_possible_cpu(i) { \ - if (i > rnp[j].grphi) \ - j++; \ - per_cpu(rcu_data, i).mynode = &rnp[j]; \ (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - rcu_boot_init_percpu_data(i, rsp); \ } \ + rcu_init_one(rsp); \ } while (0) void __init rcu_init(void) { - int i; + int cpu; rcu_bootup_announce(); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); -#endif /* #if NUM_RCU_LVL_4 != 0 */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); __rcu_init_preempt(); @@ -1896,8 +1958,9 @@ void __init rcu_init(void) * or the scheduler are operational. */ cpu_notifier(rcu_cpu_notify, 0); - for_each_online_cpu(i) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); + for_each_online_cpu(cpu) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + check_cpu_stall_init(); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d2a0046f63b..14c040b18ed 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -90,12 +90,12 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; /* Root rcu_node's lock protects some */ + raw_spinlock_t lock; /* Root rcu_node's lock protects some */ /* rcu_state fields as well as following. */ - long gpnum; /* Current grace period for this node. */ + unsigned long gpnum; /* Current grace period for this node. 
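
rcu_init_one() above pairs its new lockdep name table with BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)) so a table/constant mismatch fails at compile time. A stand-alone sketch of the same guard using a C11 static assertion (the names here are invented for illustration; the kernel macro itself predates C11):

#include <assert.h>

#define MAX_LEVELS 4

static const char *const level_name[] = {
        "level_0", "level_1", "level_2", "level_3",
};

/* Compilation fails if MAX_LEVELS grows without extending level_name[]. */
static_assert(MAX_LEVELS <= sizeof(level_name) / sizeof(level_name[0]),
              "extend level_name[] to match MAX_LEVELS");

int main(void)
{
        return 0;
}
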
*/ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ - long completed; /* Last grace period completed for this node. */ + unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -161,11 +161,11 @@ struct rcu_node { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - long completed; /* Track rsp->completed gp number */ + unsigned long completed; /* Track rsp->completed gp number */ /* in order to detect GP end. */ - long gpnum; /* Highest gp number that this CPU */ + unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - long passed_quiesc_completed; + unsigned long passed_quiesc_completed; /* Value of completed at time of qs. */ bool passed_quiesc; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ @@ -221,14 +221,15 @@ struct rcu_data { unsigned long resched_ipi; /* Sent a resched IPI. */ /* 5) __rcu_pending() statistics. */ - long n_rcu_pending; /* rcu_pending() calls since boot. */ - long n_rp_qs_pending; - long n_rp_cb_ready; - long n_rp_cpu_needs_gp; - long n_rp_gp_completed; - long n_rp_gp_started; - long n_rp_need_fqs; - long n_rp_need_nothing; + unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ + unsigned long n_rp_qs_pending; + unsigned long n_rp_report_qs; + unsigned long n_rp_cb_ready; + unsigned long n_rp_cpu_needs_gp; + unsigned long n_rp_gp_completed; + unsigned long n_rp_gp_started; + unsigned long n_rp_need_fqs; + unsigned long n_rp_need_nothing; int cpu; }; @@ -237,25 +238,36 @@ struct rcu_data { #define RCU_GP_IDLE 0 /* No grace period in progress. */ #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ -#define RCU_FORCE_QS 4 /* Need to force quiescent state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED +#define RCU_SIGNAL_INIT RCU_FORCE_QS #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) +#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + /* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. 
The root (first level) @@ -277,12 +289,19 @@ struct rcu_state { u8 signaled ____cacheline_internodealigned_in_smp; /* Force QS state. */ - long gpnum; /* Current gp number. */ - long completed; /* # of last completed gp. */ + u8 fqs_active; /* force_quiescent_state() */ + /* is running. */ + u8 fqs_need_gp; /* A CPU was prevented from */ + /* starting a new grace */ + /* period because */ + /* force_quiescent_state() */ + /* was running. */ + unsigned long gpnum; /* Current gp number. */ + unsigned long completed; /* # of last completed gp. */ /* End of fields guarded by root rcu_node's lock. */ - spinlock_t onofflock; /* exclude on/offline and */ + raw_spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. Also */ /* protects the following */ /* orphan_cbs fields. */ @@ -292,10 +311,8 @@ struct rcu_state { /* going offline. */ struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ long orphan_qlen; /* Number of orphaned cbs. */ - spinlock_t fqslock; /* Only one task forcing */ + raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ - long completed_fqs; /* Value of completed @ snap. */ - /* Protected by fqslock. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -310,6 +327,7 @@ struct rcu_state { unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + char *name; /* Name of structure. */ }; /* Return values for rcu_preempt_offline_tasks(). */ @@ -319,8 +337,6 @@ struct rcu_state { #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ /* GP were moved to root. */ -#ifdef RCU_TREE_NONCORE - /* * RCU implementation internal declarations: */ @@ -335,7 +351,7 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#else /* #ifdef RCU_TREE_NONCORE */ +#ifndef RCU_TREE_NONCORE /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); @@ -347,6 +363,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); @@ -367,5 +384,6 @@ static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_orphanage(void); static void __init __rcu_init_preempt(void); +static void rcu_needs_cpu_flush(void); -#endif /* #else #ifdef RCU_TREE_NONCORE */ +#endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d..0e4f420245d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -26,6 +26,45 @@ #include <linux/delay.h> +/* + * Check the RCU kernel configuration parameters and print informative + * messages about anything out of the ordinary. If you like #ifdef, you + * will love this function. 
+ */ +static void __init rcu_bootup_announce_oddness(void) +{ +#ifdef CONFIG_RCU_TRACE + printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); +#endif +#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) + printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); +#endif +#ifdef CONFIG_RCU_FANOUT_EXACT + printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); +#endif +#ifdef CONFIG_RCU_FAST_NO_HZ + printk(KERN_INFO + "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); +#endif +#ifdef CONFIG_PROVE_RCU + printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); +#endif +#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE + printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); +#endif +#ifndef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO + "\tRCU-based detection of stalled CPUs is disabled.\n"); +#endif +#ifndef CONFIG_RCU_CPU_STALL_VERBOSE + printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); +#endif +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); +#endif +} + #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); @@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO - "Experimental preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); } /* @@ -62,17 +101,32 @@ long rcu_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_batches_completed); /* + * Force a quiescent state for preemptible RCU. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_preempt_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* * Record a preemptable-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. */ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } /* @@ -102,7 +156,7 @@ static void rcu_preempt_note_context_switch(int cpu) /* Possibly blocking in an RCU read-side critical section. */ rdp = rcu_preempt_state.rda[cpu]; rnp = rdp->mynode; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -123,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu) WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -135,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. 
*/ - rcu_preempt_qs(cpu); local_irq_save(flags); - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -180,7 +233,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -197,8 +250,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) /* Report up the rest of the hierarchy. */ mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ - spin_lock(&rnp_p->lock); /* irqs already disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -227,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t) */ special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; rcu_preempt_qs(smp_processor_id()); } @@ -248,10 +300,10 @@ static void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty = !rcu_preempted_readers(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); @@ -265,7 +317,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ if (empty) - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); else rcu_report_unblock_qs_rnp(rnp, flags); @@ -295,29 +347,73 @@ void __rcu_read_unlock(void) if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#ifdef CONFIG_RCU_CPU_STALL_VERBOSE + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct list_head *lp; + int phase; + struct task_struct *t; + + if (rcu_preempted_readers(rnp)) { + raw_spin_lock_irqsave(&rnp->lock, flags); + phase = rnp->gpnum & 0x1; + lp = &rnp->blocked_tasks[phase]; + list_for_each_entry(t, lp, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period. 
+ */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + rcu_print_detail_task_stall_rnp(rnp); + rcu_for_each_leaf_node(rsp, rnp) + rcu_print_detail_task_stall_rnp(rnp); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static void rcu_print_task_stall(struct rcu_node *rnp) { - unsigned long flags; struct list_head *lp; int phase; struct task_struct *t; if (rcu_preempted_readers(rnp)) { - spin_lock_irqsave(&rnp->lock, flags); phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); - spin_unlock_irqrestore(&rnp->lock, flags); } } @@ -388,11 +484,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, lp_root = &rnp_root->blocked_tasks[i]; while (!list_empty(lp)) { tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - spin_lock(&rnp_root->lock); /* irqs already disabled */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ list_del(&tp->rcu_node_entry); tp->rcu_blocked_node = rnp_root; list_add(&tp->rcu_node_entry, lp_root); - spin_unlock(&rnp_root->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } return retval; @@ -420,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; rcu_preempt_qs(cpu); return; } @@ -462,11 +557,13 @@ void synchronize_rcu(void) if (!rcu_scheduler_active) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -516,7 +613,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) unsigned long flags; unsigned long mask; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) break; @@ -525,12 +622,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) break; } mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - spin_lock(&rnp->lock); /* irqs already disabled */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -545,11 +642,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { int must_wait; - spin_lock(&rnp->lock); /* irqs already disabled */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); must_wait = rcu_preempted_readers_exp(rnp); - spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -594,13 +691,13 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto blocked_tasks[]. 
*/ synchronize_sched_expedited(); - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->expmask = rnp->qsmaskinit; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } /* Snapshot current state of ->blocked_tasks[] lists. */ @@ -609,7 +706,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ rnp = rcu_get_root(rsp); @@ -701,6 +798,7 @@ void exit_rcu(void) static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); } /* @@ -713,6 +811,16 @@ long rcu_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_batches_completed); /* + * Force a quiescent state for RCU, which, because there is no preemptible + * RCU, becomes the same as rcu-sched. + */ +void rcu_force_quiescent_state(void) +{ + rcu_sched_force_quiescent_state(); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. */ @@ -734,7 +842,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp) /* Because preemptible RCU does not exist, no quieting of tasks. */ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -745,6 +853,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * Because preemptable RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +/* + * Because preemptable RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ static void rcu_print_task_stall(struct rcu_node *rnp) { } @@ -884,3 +1000,123 @@ static void __init __rcu_init_preempt(void) } #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#if !defined(CONFIG_RCU_FAST_NO_HZ) + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we have preemptible RCU, just check whether this CPU needs + * any flavor of RCU. Do not chew up lots of CPU cycles with preemption + * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + */ +int rcu_needs_cpu(int cpu) +{ + return rcu_needs_cpu_quick_check(cpu); +} + +/* + * Check to see if we need to continue a callback-flush operations to + * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle + * entry is not configured, so we never do need to. 
+ */ +static void rcu_needs_cpu_flush(void) +{ +} + +#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#define RCU_NEEDS_CPU_FLUSHES 5 +static DEFINE_PER_CPU(int, rcu_dyntick_drain); +static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we are not supporting preemptible RCU, attempt to accelerate + * any current grace periods so that RCU no longer needs this CPU, but + * only if all other CPUs are already in dynticks-idle mode. This will + * allow the CPU cores to be powered down immediately, as opposed to after + * waiting many milliseconds for grace periods to elapse. + * + * Because it is not legal to invoke rcu_process_callbacks() with irqs + * disabled, we do one pass of force_quiescent_state(), then do a + * raise_softirq() to cause rcu_process_callbacks() to be invoked later. + * The per-cpu rcu_dyntick_drain variable controls the sequencing. + */ +int rcu_needs_cpu(int cpu) +{ + int c = 0; + int snap; + int snap_nmi; + int thatcpu; + + /* Check for being in the holdoff period. */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return rcu_needs_cpu_quick_check(cpu); + + /* Don't bother unless we are the last non-dyntick-idle CPU. */ + for_each_online_cpu(thatcpu) { + if (thatcpu == cpu) + continue; + snap = per_cpu(rcu_dynticks, thatcpu).dynticks; + snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; + smp_mb(); /* Order sampling of snap with end of grace period. */ + if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + return rcu_needs_cpu_quick_check(cpu); + } + } + + /* Check and update the rcu_dyntick_drain sequencing. */ + if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* First time through, initialize the counter. */ + per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; + } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* We have hit the limit, so time to give up. */ + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + return rcu_needs_cpu_quick_check(cpu); + } + + /* Do one step pushing remaining RCU callbacks through. */ + if (per_cpu(rcu_sched_data, cpu).nxtlist) { + rcu_sched_qs(cpu); + force_quiescent_state(&rcu_sched_state, 0); + c = c || per_cpu(rcu_sched_data, cpu).nxtlist; + } + if (per_cpu(rcu_bh_data, cpu).nxtlist) { + rcu_bh_qs(cpu); + force_quiescent_state(&rcu_bh_state, 0); + c = c || per_cpu(rcu_bh_data, cpu).nxtlist; + } + + /* If RCU callbacks are still pending, RCU still needs this CPU. */ + if (c) + raise_softirq(RCU_SOFTIRQ); + return c; +} + +/* + * Check to see if we need to continue a callback-flush operations to + * allow the last CPU to enter dyntick-idle mode. 
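
The CONFIG_RCU_FAST_NO_HZ version of rcu_needs_cpu() above paces callback flushing with a per-CPU drain counter plus a holdoff timestamp: a few flush passes per burst, then back off until jiffies advances. A rough user-space model of that pacing, with invented names and a simulated jiffies value:

#include <stdio.h>

#define FLUSH_LIMIT 5   /* stands in for RCU_NEEDS_CPU_FLUSHES */

static int drain;               /* flush passes left in this burst */
static unsigned long holdoff;   /* "jiffies" value when we gave up */

static int want_flush(unsigned long jiffies)
{
        if (holdoff == jiffies)
                return 0;               /* in holdoff: stop trying this tick */
        if (drain <= 0) {
                drain = FLUSH_LIMIT;    /* first attempt: arm the counter */
        } else if (--drain <= 0) {
                holdoff = jiffies;      /* burst exhausted: hold off */
                return 0;
        }
        return 1;                       /* do one more flush pass */
}

int main(void)
{
        unsigned long j = 1000;
        int i;

        for (i = 0; i < 8; i++)
                printf("pass %d at j=%lu: flush=%d\n", i, j, want_flush(j));
        printf("later, at j=%lu: flush=%d\n", j + 1, want_flush(j + 1));
        return 0;
}
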
+ */ +static void rcu_needs_cpu_flush(void) +{ + int cpu = smp_processor_id(); + unsigned long flags; + + if (per_cpu(rcu_dyntick_drain, cpu) <= 0) + return; + local_irq_save(flags); + (void)rcu_needs_cpu(cpu); + local_irq_restore(flags); +} + +#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9d2c88423b3..36c95b45738 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, @@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", + seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, @@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { - long gpnum; + unsigned long gpnum; int level = 0; int phase; struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " + seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), @@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = { static int show_rcugp(struct seq_file *m, void *unused) { #ifdef CONFIG_TREE_PREEMPT_RCU - seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", + seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", rcu_preempt_state.completed, rcu_preempt_state.gpnum); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", + seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", rcu_sched_state.completed, rcu_sched_state.gpnum); - seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", + seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", rcu_bh_state.completed, rcu_bh_state.gpnum); return 0; } @@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + "qsp=%ld rpq=%ld cbr=%ld cng=%ld " + "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' 
: ' ', rdp->n_rcu_pending, rdp->n_rp_qs_pending, + rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp, rdp->n_rp_gp_completed, diff --git a/kernel/relay.c b/kernel/relay.c index c705a41b4ba..c7cf397fb92 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, "relay_hotcpu_callback: cpu %d buffer " "creation failed\n", hotcpu); mutex_unlock(&relay_channels_mutex); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); } } mutex_unlock(&relay_channels_mutex); @@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ -static int subbuf_splice_actor(struct file *in, +static ssize_t subbuf_splice_actor(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1231,8 +1231,8 @@ static int subbuf_splice_actor(struct file *in, size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, @@ -1241,9 +1241,12 @@ static int subbuf_splice_actor(struct file *in, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, }; + ssize_t ret; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * Adjust read len, if longer than what is available @@ -1254,7 +1257,7 @@ static int subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; @@ -1288,16 +1291,19 @@ static int subbuf_splice_actor(struct file *in, } } + ret = 0; if (!spd.nr_pages) - return 0; + goto out; ret = *nonpad_ret = splice_to_pipe(pipe, &spd); if (ret < 0 || ret < total_len) - return ret; + goto out; if (read_start + ret == nonpad_end) ret += padding; +out: + splice_shrink_spd(pipe, &spd); return ret; } diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bcdabf37c40..c7eaa37a768 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -10,7 +10,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/slab.h> #include <linux/res_counter.h> #include <linux/uaccess.h> #include <linux/mm.h> diff --git a/kernel/resource.c b/kernel/resource.c index af96c1e4b54..7b36976e5de 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -15,6 +15,7 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/proc_fs.h> +#include <linux/sched.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> @@ -188,20 +189,65 @@ static int __release_resource(struct resource *old) return -EINVAL; } +static void 
__release_child_resources(struct resource *r) +{ + struct resource *tmp, *p; + resource_size_t size; + + p = r->child; + r->child = NULL; + while (p) { + tmp = p; + p = p->sibling; + + tmp->parent = NULL; + tmp->sibling = NULL; + __release_child_resources(tmp); + + printk(KERN_DEBUG "release child resource %pR\n", tmp); + /* need to restore size, and keep flags */ + size = resource_size(tmp); + tmp->start = 0; + tmp->end = size - 1; + } +} + +void release_child_resources(struct resource *r) +{ + write_lock(&resource_lock); + __release_child_resources(r); + write_unlock(&resource_lock); +} + /** - * request_resource - request and reserve an I/O or memory resource + * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * - * Returns 0 for success, negative error code on error. + * Returns 0 for success, conflict resource on error. */ -int request_resource(struct resource *root, struct resource *new) +struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + conflict = request_resource_conflict(root, new); return conflict ? -EBUSY : 0; } @@ -274,7 +320,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; - unsigned long pfn, len; + unsigned long pfn, end_pfn; u64 orig_end; int ret = -1; @@ -284,9 +330,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, orig_end = res.end; while ((res.start < res.end) && (find_next_system_ram(&res, "System RAM") >= 0)) { - pfn = (unsigned long)(res.start >> PAGE_SHIFT); - len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); - ret = (*func)(pfn, len, arg); + pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; + end_pfn = (res.end + 1) >> PAGE_SHIFT; + if (end_pfn > pfn) + ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; res.start = res.end + 1; @@ -297,14 +344,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, #endif +static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + return 1; +} +/* + * This generic page_is_ram() returns true if specified address is + * registered as "System RAM" in iomem_resource list. + */ +int __weak page_is_ram(unsigned long pfn) +{ + return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; +} + /* * Find empty slot in the resource tree given range and alignment. 
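
The walk_system_ram_range() change above rounds the start address up to a full page frame and converts the inclusive end to an exclusive pfn, so a range that begins or ends mid-page no longer over-reports RAM pages. A small arithmetic demo (a PAGE_SHIFT of 12 is assumed purely for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long start = 0x1800;   /* range starts mid-page */
        unsigned long end = 0x4fff;     /* inclusive end address */

        unsigned long old_pfn = start >> PAGE_SHIFT;                    /* 1 */
        unsigned long new_pfn = (start + PAGE_SIZE - 1) >> PAGE_SHIFT;  /* 2 */
        unsigned long end_pfn = (end + 1) >> PAGE_SHIFT;                /* 5 */

        printf("truncated first pfn: %lu\n", old_pfn);
        printf("rounded-up first pfn: %lu, full pages: %lu\n",
               new_pfn, end_pfn - new_pfn);
        return 0;
}
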
*/ static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), void *alignf_data) { struct resource *this = root->child; @@ -330,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = max; tmp.start = ALIGN(tmp.start, align); if (alignf) - alignf(alignf_data, &tmp, size, align); + tmp.start = alignf(alignf_data, &tmp, size, align); if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { new->start = tmp.start; new->end = tmp.start + size - 1; @@ -358,8 +420,10 @@ static int find_resource(struct resource *root, struct resource *new, int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), void *alignf_data) { int err; @@ -426,25 +490,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou } /** - * insert_resource - Inserts a resource in the resource tree + * insert_resource_conflict - Inserts resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * - * Returns 0 on success, -EBUSY if the resource can't be inserted. + * Returns 0 on success, conflict resource if the resource can't be inserted. * - * This function is equivalent to request_resource when no conflict + * This function is equivalent to request_resource_conflict when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. */ -int insert_resource(struct resource *parent, struct resource *new) +struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __insert_resource(parent, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } @@ -603,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res) * release_region releases a matching busy region. 
*/ +static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); + /** * __request_region - create a new busy resource region * @parent: parent resource descriptor @@ -615,6 +696,7 @@ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { + DECLARE_WAITQUEUE(wait, current); struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res) @@ -639,7 +721,15 @@ struct resource * __request_region(struct resource *parent, if (!(conflict->flags & IORESOURCE_BUSY)) continue; } - + if (conflict->flags & flags & IORESOURCE_MUXED) { + add_wait_queue(&muxed_resource_wait, &wait); + write_unlock(&resource_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + remove_wait_queue(&muxed_resource_wait, &wait); + write_lock(&resource_lock); + continue; + } /* Uhhuh, that didn't work out.. */ kfree(res); res = NULL; @@ -713,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start, break; *p = res->sibling; write_unlock(&resource_lock); + if (res->flags & IORESOURCE_MUXED) + wake_up(&muxed_resource_wait); kfree(res); return; } diff --git a/kernel/sched.c b/kernel/sched.c index 4508fe7048b..dc85ceb9083 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -55,9 +55,9 @@ #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/percpu.h> -#include <linux/kthread.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> @@ -71,11 +71,13 @@ #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <asm/tlb.h> #include <asm/irq_regs.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -233,7 +235,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) */ static DEFINE_MUTEX(sched_domains_mutex); -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED #include <linux/cgroup.h> @@ -243,13 +245,7 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; -#endif - -#ifdef CONFIG_USER_SCHED - uid_t uid; -#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ @@ -274,35 +270,7 @@ struct task_group { struct list_head children; }; -#ifdef CONFIG_USER_SCHED - -/* Helper function to pass uid information to create_sched_user() */ -void set_tg_uid(struct user_struct *user) -{ - user->tg->uid = user->uid; -} - -/* - * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. - */ -struct task_group root_task_group; - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Default task group's sched entity on each cpu */ -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); -/* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); -#endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. 
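
The IORESOURCE_MUXED support added to __request_region() above puts a contending caller to sleep on muxed_resource_wait and retries after the holder's __release_region() wakes it. A rough pthread analogue of that sleep-and-retry pattern (illustrative only, not the kernel implementation):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t region_released = PTHREAD_COND_INITIALIZER;
static int region_busy;

static void claim_region(const char *who)
{
        pthread_mutex_lock(&lock);
        while (region_busy)             /* conflict: sleep, then re-check */
                pthread_cond_wait(&region_released, &lock);
        region_busy = 1;
        printf("%s: claimed shared region\n", who);
        pthread_mutex_unlock(&lock);
}

static void release_region_demo(const char *who)
{
        pthread_mutex_lock(&lock);
        region_busy = 0;
        printf("%s: released shared region\n", who);
        pthread_cond_broadcast(&region_released);   /* like wake_up() */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        claim_region(arg);
        release_region_demo(arg);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, worker, (void *)"A");
        pthread_create(&b, NULL, worker, (void *)"B");
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
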
@@ -318,11 +286,7 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif /* CONFIG_USER_SCHED */ /* * A weight of 0 or 1 can cause arithmetics problems. @@ -343,47 +307,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; */ struct task_group init_task_group; -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_USER_SCHED - rcu_read_lock(); - tg = __task_cred(p)->user->tg; - rcu_read_unlock(); -#elif defined(CONFIG_CGROUP_SCHED) - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -478,7 +402,6 @@ struct rt_rq { struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; - struct sched_rt_entity *rt_se; #endif }; @@ -534,9 +457,13 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ - unsigned char in_nohz_recently; + u64 nohz_stamp; + unsigned char nohz_balance_kick; #endif + unsigned int skip_clock_update; + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -573,20 +500,20 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; int active_balance; int push_cpu; + struct cpu_stop_work active_balance_work; /* cpu of this runqueue: */ int cpu; int online; unsigned long avg_load_per_task; - struct task_struct *migration_thread; - struct list_head migration_queue; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -634,6 +561,13 @@ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { rq->curr->sched_class->check_preempt_curr(rq, p, flags); + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (test_tsk_need_resched(p)) + rq->skip_clock_update = 1; } static inline int cpu_of(struct rq *rq) @@ -645,6 +579,11 @@ static inline int cpu_of(struct rq *rq) #endif } +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&sched_domains_mutex)) + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -653,7 +592,7 @@ static inline int cpu_of(struct rq *rq) * preempt-disabled sections. 
*/ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) @@ -661,9 +600,53 @@ static inline int cpu_of(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&task_rq(p)->lock)); + return container_of(css, struct task_group, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + inline void update_rq_clock(struct rq *rq) { - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) + rq->clock = sched_clock_cpu(cpu_of(rq)); } /* @@ -941,14 +924,25 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * Check whether the task is waking, we use this to synchronize ->cpus_allowed + * against ttwu(). + */ +static inline int task_is_waking(struct task_struct *p) +{ + return unlikely(p->state == TASK_WAKING); +} + +/* * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { + struct rq *rq; + for (;;) { - struct rq *rq = task_rq(p); + rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; @@ -976,14 +970,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void task_rq_unlock_wait(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&rq->lock); -} - static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { @@ -1209,6 +1195,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
+ */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU. In case of a completely @@ -1247,6 +1254,7 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1259,6 +1267,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } @@ -1280,6 +1294,10 @@ static void resched_task(struct task_struct *p) static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } + +static void sched_avg_update(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -1390,32 +1408,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); - -/* - * runqueue iterator, to support SMP load-balancing between different - * scheduling classes, without having to expose their internal data - * structures to the load-balancing proper: - */ -struct rq_iterator { - void *arg; - struct task_struct *(*start)(void *); - struct task_struct *(*next)(void *); -}; - -#ifdef CONFIG_SMP -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator); - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator); -#endif - /* Time spent by the tasks of the cpu accounting group executing in ... */ enum cpuacct_stat_index { CPUACCT_STAT_USER, /* ... 
user mode */ @@ -1529,24 +1521,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1566,7 +1543,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long *update_shares_data; +static __read_mostly unsigned long __percpu *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1692,7 +1669,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { @@ -1701,21 +1678,8 @@ static void update_shares(struct sched_domain *sd) } } -static void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ - if (root_task_group_empty()) - return; - - raw_spin_unlock(&rq->lock); - update_shares(sd); - raw_spin_lock(&rq->lock); -} - static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -1725,10 +1689,6 @@ static inline void update_shares(struct sched_domain *sd) { } -static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) -{ -} - #endif #ifdef CONFIG_PREEMPT @@ -1805,6 +1765,49 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1816,9 +1819,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif -static void calc_load_account_active(struct rq *this_rq); +static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -1834,18 +1838,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #endif } -#include "sched_stats.h" -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif +static const struct sched_class rt_sched_class; #define sched_class_highest (&rt_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) +#include "sched_stats.h" + static void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1859,8 +1859,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -1877,40 +1877,53 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void update_avg(u64 *avg, u64 sample) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - s64 diff = sample - *avg; - *avg += diff >> 3; + update_rq_clock(rq); + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, flags); + p->se.on_rq = 1; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - if (wakeup) - p->se.start_runtime = p->se.sum_exec_runtime; + update_rq_clock(rq); + sched_info_dequeued(p); + p->sched_class->dequeue_task(rq, p, flags); + p->se.on_rq = 0; +} - sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); - p->se.on_rq = 1; +/* + * activate_task - move a task to the runqueue. + */ +static void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, flags); + inc_nr_running(rq); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +/* + * deactivate_task - remove a task from the runqueue. 
+ */ +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - if (sleep) { - if (p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } else { - update_avg(&p->se.avg_wakeup, - sysctl_sched_wakeup_granularity); - } - } + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; - sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, sleep); - p->se.on_rq = 0; + dequeue_task(rq, p, flags); + dec_nr_running(rq); } +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + /* * __normal_prio - return the priority that is based on the static prio */ @@ -1957,30 +1970,6 @@ static int effective_prio(struct task_struct *p) return p->prio; } -/* - * activate_task - move a task to the runqueue. - */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, wakeup); - inc_nr_running(rq); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) -{ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - - dequeue_task(rq, p, sleep); - dec_nr_running(rq); -} - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -2053,21 +2042,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } -struct migration_req { - struct list_head list; - +struct migration_arg { struct task_struct *task; int dest_cpu; - - struct completion done; }; +static int migration_cpu_stop(void *data); + /* * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +static bool migrate_task(struct task_struct *p, int dest_cpu) { struct rq *rq = task_rq(p); @@ -2075,58 +2061,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) - return 0; - - init_completion(&req->done); - req->task = p; - req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); - - return 1; -} - -/* - * wait_task_context_switch - wait for a thread to complete at least one - * context switch. - * - * @p must not be current. - */ -void wait_task_context_switch(struct task_struct *p) -{ - unsigned long nvcsw, nivcsw, flags; - int running; - struct rq *rq; - - nvcsw = p->nvcsw; - nivcsw = p->nivcsw; - for (;;) { - /* - * The runqueue is assigned before the actual context - * switch. We need to take the runqueue lock. - * - * We could check initially without the lock but it is - * very likely that we need to take the lock in every - * iteration. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - task_rq_unlock(rq, &flags); - - if (likely(!running)) - break; - /* - * The switch count is incremented before the actual - * context switch. We thus wait for two switches to be - * sure at least one completed. 
- */ - if ((p->nvcsw - nvcsw) > 1) - break; - if ((p->nivcsw - nivcsw) > 1) - break; - - cpu_relax(); - } + return p->se.on_rq || task_running(rq, p); } /* @@ -2184,7 +2119,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * just go back and repeat. */ rq = task_rq_lock(p, &flags); - trace_sched_wait_task(rq, p); + trace_sched_wait_task(p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; @@ -2282,6 +2217,9 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + */ static int select_fallback_rq(int cpu, struct task_struct *p) { int dest_cpu; @@ -2298,12 +2236,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - rcu_read_lock(); - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - rcu_read_unlock(); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - + if (unlikely(dest_cpu >= nr_cpu_ids)) { + dest_cpu = cpuset_cpus_allowed_fallback(p); /* * Don't tell them about moving exiting tasks or * kernel threads (both mm NULL), since they never @@ -2320,19 +2254,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Called from: - * - * - fork, @p is stable because it isn't on the tasklist yet - * - * - exec, @p is unstable, retry loop - * - * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so - * we should be good. + * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2350,13 +2277,63 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) return cpu; } + +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * 
@sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2364,23 +2341,21 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq, *orig_rq; - - if (!sched_feat(SYNC_WAKEUPS)) - wake_flags &= ~WF_SYNC; + unsigned long en_flags = ENQUEUE_WAKEUP; + struct rq *rq; this_cpu = get_cpu(); smp_wmb(); - rq = orig_rq = task_rq_lock(p, &flags); - update_rq_clock(rq); + rq = task_rq_lock(p, &flags); if (!(p->state & state)) goto out; @@ -2400,24 +2375,35 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, * * First fix up the nr_uninterruptible count: */ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; + if (task_contributes_to_load(p)) { + if (likely(cpu_online(orig_cpu))) + rq->nr_uninterruptible--; + else + this_rq()->nr_uninterruptible--; + } p->state = TASK_WAKING; - if (p->sched_class->task_waking) + if (p->sched_class->task_waking) { p->sched_class->task_waking(rq, p); + en_flags |= ENQUEUE_WAKING; + } - __task_rq_unlock(rq); - - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); + __task_rq_unlock(rq); - rq = __task_rq_lock(p); - update_rq_clock(rq); + rq = cpu_rq(cpu); + raw_spin_lock(&rq->lock); + /* + * We migrated the task without holding either rq->lock, however + * since the task is not on the task list itself, nobody else + * will try and migrate the task, hence the rq should match the + * cpu we just moved it to. + */ + WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); - cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2436,54 +2422,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); - else - schedstat_inc(p, se.nr_wakeups_remote); - activate_task(rq, p, 1); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - - /* - * Only attribute actual wakeups done by this task. 
- */ - if (!in_interrupt()) { - struct sched_entity *se = ¤t->se; - u64 sample = se->sum_exec_runtime; - - if (se->last_wakeup) - sample -= se->last_wakeup; - else - sample -= se->start_runtime; - update_avg(&se->avg_wakeup, sample); - - se->last_wakeup = se->sum_exec_runtime; - } - out_running: - trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2492,6 +2435,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2525,42 +2499,9 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; - p->se.last_wakeup = 0; - p->se.avg_overlap = 0; - p->se.start_runtime = 0; - p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.wait_max = 0; - p->se.wait_count = 0; - p->se.wait_sum = 0; - - p->se.sleep_start = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - - p->se.block_start = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); @@ -2581,11 +2522,11 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); /* - * We mark the process as waking here. This guarantees that + * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_WAKING; + p->state = TASK_RUNNING; /* * Revert to default priority/policy on fork if requested. 
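The single memset() of p->se.statistics in __sched_fork() above works because the per-entity schedstat fields that used to be cleared one by one now live in a dedicated sub-structure. Its real definition sits in include/linux/sched.h and is not part of the hunks shown here, so the sketch below only illustrates the assumed shape, reusing the field names visible in the removed lines; it is not meant to be compiled against the real headers.

/*
 * Illustrative layout only: struct sched_statistics gathers the former
 * p->se.wait_*, sleep_*, block_* and nr_wakeups_* counters so they can be
 * wiped wholesale at fork time.
 */
#include <linux/types.h>
#include <linux/string.h>

struct sched_statistics {
	u64	wait_start, wait_max, wait_count, wait_sum;
	u64	sleep_start, sleep_max, sum_sleep_runtime;
	u64	block_start, block_max, exec_max, slice_max;
	u64	nr_wakeups, nr_wakeups_sync, nr_wakeups_migrate;
	u64	nr_wakeups_local, nr_wakeups_remote;
	/* ... remaining migration/wakeup-failure counters ... */
};

struct demo_sched_entity {			/* stand-in for struct sched_entity */
	struct sched_statistics statistics;	/* guarded by CONFIG_SCHEDSTATS upstream */
};

static void demo_fork_reset(struct demo_sched_entity *se)
{
	/* One memset() replaces the long run of per-field zeroing removed above. */
	memset(&se->statistics, 0, sizeof(se->statistics));
}

With the counters grouped this way, the #ifdef CONFIG_SCHEDSTATS block in __sched_fork() shrinks to a single line, as the hunk above shows.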
@@ -2620,10 +2561,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); -#ifdef CONFIG_SMP - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); -#endif + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2652,19 +2599,37 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu __maybe_unused = get_cpu(); +#ifdef CONFIG_SMP rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_WAKING); + p->state = TASK_WAKING; + + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We set TASK_WAKING so that select_task_rq() can drop rq->lock + * without people poking at ->cpus_allowed. + */ + cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); + p->state = TASK_RUNNING; - update_rq_clock(rq); + task_rq_unlock(rq, &flags); +#endif + + rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); - trace_sched_wakeup_new(rq, p, 1); + trace_sched_wakeup_new(p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2783,7 +2748,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_event_task_sched_in(current, cpu_of(rq)); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + perf_event_task_sched_in(current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -2871,7 +2842,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(rq, prev, next); + trace_sched_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2969,9 +2940,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } @@ -2988,57 +2959,9 @@ static unsigned long calc_load_update; unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) +static long calc_load_fold_active(struct rq *this_rq) { - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; -} - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(void) -{ - unsigned long upd = calc_load_update + 10; - long active; - - if (time_before(jiffies, upd)) - return; - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; -} - -/* - * Either called from update_cpu_load() or from a cpu going idle - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long nr_active, delta; + long nr_active, delta = 0; nr_active = this_rq->nr_running; nr_active += (long) this_rq->nr_uninterruptible; @@ -3046,1902 +2969,265 @@ static void calc_load_account_active(struct rq *this_rq) if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; this_rq->calc_load_active = nr_active; - atomic_long_add(delta, &calc_load_tasks); - } -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). - */ -static void update_cpu_load(struct rq *this_rq) -{ - unsigned long this_load = this_rq->load.weight; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } - if (time_after_eq(jiffies, this_rq->calc_load_update)) { - this_rq->calc_load_update += LOAD_FREQ; - calc_load_account_active(this_rq); - } -} - -#ifdef CONFIG_SMP - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } - update_rq_clock(rq1); - update_rq_clock(rq2); + return delta; } +#ifdef CONFIG_NO_HZ /* - * double_rq_unlock - safely unlock two runqueues + * For NO_HZ we delay the active fold to the next LOAD_FREQ update. * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. 
- */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - struct task_struct *p = current; - struct migration_req req; - int dest_cpu, this_cpu; - unsigned long flags; - struct rq *rq; - -again: - this_cpu = get_cpu(); - dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); - if (dest_cpu == this_cpu) { - put_cpu(); - return; - } - - rq = task_rq_lock(p, &flags); - put_cpu(); - - /* - * select_task_rq() can race against ->cpus_allowed - */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) { - task_rq_unlock(rq, &flags); - goto again; - } - - /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { - /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); - task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); - - return; - } - task_rq_unlock(rq, &flags); -} - -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. + * When making the ILB scale, we should try to pull this in as well. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) -{ - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); -} +static atomic_long_t calc_load_tasks_idle; -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) +static void calc_load_account_idle(struct rq *this_rq) { - int tsk_cache_hot = 0; - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); - return 0; - } - *all_pinned = 0; - - if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); - return 0; - } + long delta; - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. 
- */ - - tsk_cache_hot = task_hot(p, rq->clock, sd); - if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); - } -#endif - return 1; - } - - if (tsk_cache_hot) { - schedstat_inc(p, se.nr_failed_migrations_hot); - return 0; - } - return 1; + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks_idle); } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct rq_iterator *iterator) +static long calc_load_fold_idle(void) { - int loops = 0, pulled = 0, pinned = 0; - struct task_struct *p; - long rem_load_move = max_load_move; - - if (max_load_move == 0) - goto out; - - pinned = 1; - - /* - * Start the load-balancing iterator: - */ - p = iterator->start(iterator->arg); -next: - if (!p || loops++ > sysctl_sched_nr_migrate) - goto out; - - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - p = iterator->next(iterator->arg); - goto next; - } - - pull_task(busiest, p, this_rq, this_cpu); - pulled++; - rem_load_move -= p->se.load.weight; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible kernels - * will stop after the first task is pulled to minimize the critical - * section. - */ - if (idle == CPU_NEWLY_IDLE) - goto out; -#endif + long delta = 0; /* - * We only want to steal up to the prescribed amount of weighted load. - */ - if (rem_load_move > 0) { - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; - p = iterator->next(iterator->arg); - goto next; - } -out: - /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). + * Its got a race, we don't care... */ - schedstat_add(sd, lb_gained[idle], pulled); - - if (all_pinned) - *all_pinned = pinned; + if (atomic_long_read(&calc_load_tasks_idle)) + delta = atomic_long_xchg(&calc_load_tasks_idle, 0); - return max_load_move - rem_load_move; + return delta; } - -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) -{ - const struct sched_class *class = sched_class_highest; - unsigned long total_load_moved = 0; - int this_best_prio = this_rq->curr->prio; - - do { - total_load_moved += - class->load_balance(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); - class = class->next; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. 
- */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) - break; -#endif - } while (class && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - -static int -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle, - struct rq_iterator *iterator) +#else +static void calc_load_account_idle(struct rq *this_rq) { - struct task_struct *p = iterator->start(iterator->arg); - int pinned = 0; - - while (p) { - if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - pull_task(busiest, p, this_rq, this_cpu); - /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). - */ - schedstat_inc(sd, lb_gained[idle]); - - return 1; - } - p = iterator->next(iterator->arg); - } - - return 0; } -/* - * move_one_task tries to move exactly one task from busiest to this_rq, as - * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +static inline long calc_load_fold_idle(void) { - const struct sched_class *class; - - for_each_class(class) { - if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) - return 1; - } - return 0; } -/********** Helpers for find_busiest_group ************************/ -/* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. - */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - - /* Statistics of the busiest group */ - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - - int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ #endif -}; - -/* - * sg_lb_stats - stats of a sched_group required for load_balancing - */ -struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ - unsigned long group_capacity; - int group_imb; /* Is there an imbalance in the group ? */ -}; - -/** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. 
- */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } -} /** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. - */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - if (!sds->power_savings_balance) - return 0; - - if (sds->this != sds->group_leader || - sds->group_leader == sds->group_min) - return 0; - - *imbalance = sds->min_load_per_task; - sds->busiest = sds->group_min; - - return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return SCHED_LOAD_SCALE; -} - -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - return default_scale_freq_power(sd, cpu); -} - -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = cpumask_weight(sched_domain_span(sd)); - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; -} - -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ - return default_scale_smt_power(sd, cpu); -} - -unsigned long scale_rt_power(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 total, available; - - sched_avg_update(rq); - - total = sched_avg_period() + (rq->clock - rq->age_stamp); - available = total - rq->rt_avg; - - if (unlikely((s64)total < SCHED_LOAD_SCALE)) - total = SCHED_LOAD_SCALE; - - total >>= SCHED_LOAD_SHIFT; - - return div_u64(available, total); -} - -static void update_cpu_power(struct sched_domain *sd, int cpu) -{ - unsigned long weight = cpumask_weight(sched_domain_span(sd)); - unsigned long power = SCHED_LOAD_SCALE; - 
struct sched_group *sdg = sd->groups; - - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); - else - power *= default_scale_smt_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - } - - power *= scale_rt_power(cpu); - power >>= SCHED_LOAD_SHIFT; - - if (!power) - power = 1; - - sdg->cpu_power = power; -} - -static void update_group_power(struct sched_domain *sd, int cpu) -{ - struct sched_domain *child = sd->child; - struct sched_group *group, *sdg = sd->groups; - unsigned long power; - - if (!child) { - update_cpu_power(sd, cpu); - return; - } - - power = 0; - - group = child->groups; - do { - power += group->cpu_power; - group = group->next; - } while (group != child->groups); - - sdg->cpu_power = power; -} - -/** - * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. - * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. - * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. - * @sgs: variable to hold the statistics for this group. - */ -static inline void update_sg_lb_stats(struct sched_domain *sd, - struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) -{ - unsigned long load, max_cpu_load, min_cpu_load; - int i; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; - - if (local_group) { - balance_cpu = group_first_cpu(group); - if (balance_cpu == this_cpu) - update_group_power(sd, this_cpu); - } - - /* Tally up the load of all CPUs in the group */ - sum_avg_load_per_task = avg_load_per_task = 0; - max_cpu_load = 0; - min_cpu_load = ~0UL; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { - struct rq *rq = cpu_rq(i); - - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - } - - sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; - sgs->sum_weighted_load += weighted_cpuload(i); - - sum_avg_load_per_task += cpu_avg_load_per_task(i); - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. 
- */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { - *balance = 0; - return; - } - - /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; - - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / - group->cpu_power; - - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - sgs->group_imb = 1; - - sgs->group_capacity = - DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); -} - -/** - * update_sd_lb_stats - Update sched_group's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. - * @cpus: Set of cpus considered for load balancing. - * @balance: Should we balance. - * @sds: variable to hold the statistics for this sched_domain. - */ -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) -{ - struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; - struct sg_lb_stats sgs; - int load_idx, prefer_sibling = 0; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; - - init_sd_power_savings_stats(sd, sds, idle); - load_idx = get_sd_load_idx(sd, idle); - - do { - int local_group; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, - local_group, cpus, balance, &sgs); - - if (local_group && balance && !(*balance)) - return; - - sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; - - /* - * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try - * and move all the excess tasks away. - */ - if (prefer_sibling) - sgs.group_capacity = min(sgs.group_capacity, 1UL); - - if (local_group) { - sds->this_load = sgs.avg_load; - sds->this = group; - sds->this_nr_running = sgs.sum_nr_running; - sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { - sds->max_load = sgs.avg_load; - sds->busiest = group; - sds->busiest_nr_running = sgs.sum_nr_running; - sds->busiest_load_per_task = sgs.sum_weighted_load; - sds->group_imb = sgs.group_imb; - } - - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); -} - -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. 
- */ -static inline void fix_small_imbalance(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - unsigned long tmp, pwr_now = 0, pwr_move = 0; - unsigned int imbn = 2; - - if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; - if (sds->busiest_load_per_task > - sds->this_load_per_task) - imbn = 1; - } else - sds->this_load_per_task = - cpu_avg_load_per_task(this_cpu); - - if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= - sds->busiest_load_per_task * imbn) { - *imbalance = sds->busiest_load_per_task; - return; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. - */ - - pwr_now += sds->busiest->cpu_power * - min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->cpu_power * - min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / - sds->busiest->cpu_power; - if (sds->max_load > tmp) - pwr_move += sds->busiest->cpu_power * - min(sds->busiest_load_per_task, sds->max_load - tmp); - - /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->cpu_power < - sds->busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = (sds->max_load * sds->busiest->cpu_power) / - sds->this->cpu_power; - else - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / - sds->this->cpu_power; - pwr_move += sds->this->cpu_power * - min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = sds->busiest_load_per_task; -} - -/** - * calculate_imbalance - Calculate the amount of imbalance present within the - * groups of a given sched_domain during load balance. - * @sds: statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: Cpu for which currently load balance is being performed. - * @imbalance: The variable to store the imbalance. - */ -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, - unsigned long *imbalance) -{ - unsigned long max_pull; - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) - */ - if (sds->max_load < sds->avg_load) { - *imbalance = 0; - return fix_small_imbalance(sds, this_cpu, imbalance); - } - - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(sds->max_load - sds->avg_load, - sds->max_load - sds->busiest_load_per_task); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->cpu_power, - (sds->avg_load - sds->this_load) * sds->this->cpu_power) - / SCHED_LOAD_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < sds->busiest_load_per_task) - return fix_small_imbalance(sds, this_cpu, imbalance); - -} -/******* find_busiest_group() helpers end here *********************/ - -/** - * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. 
If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. - * - * Also calculates the amount of weighted load which should be moved - * to restore balance. - * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd - * @cpus: The set of CPUs under consideration for load-balancing. - * @balance: Pointer to a variable indicating if this_cpu - * is the appropriate cpu to perform load balancing at this_level. + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left * - * Returns: - the busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. - */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) -{ - struct sd_lb_stats sds; - - memset(&sds, 0, sizeof(sds)); - - /* - * Compute the various statistics relavent for load balancing at - * this level. - */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); - - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * 6) Any rebalance would lead to ping-pong - */ - if (balance && !(*balance)) - goto ret; - - if (!sds.busiest || sds.busiest_nr_running == 0) - goto out_balanced; - - if (sds.this_load >= sds.max_load) - goto out_balanced; - - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - - if (sds.this_load >= sds.avg_load) - goto out_balanced; - - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - - sds.busiest_load_per_task /= sds.busiest_nr_running; - if (sds.group_imb) - sds.busiest_load_per_task = - min(sds.busiest_load_per_task, sds.avg_load); - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - if (sds.max_load <= sds.busiest_load_per_task) - goto out_balanced; - - /* Looks like there is an imbalance. Compute it */ - calculate_imbalance(&sds, this_cpu, imbalance); - return sds.busiest; - -out_balanced: - /* - * There is no obvious imbalance. 
But check if we can do some balancing - * to save power. - */ - if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) - return sds.busiest; -ret: - *imbalance = 0; - return NULL; -} - -/* - * find_busiest_queue - find the busiest runqueue among the cpus in group. - */ -static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) -{ - struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; - int i; - - for_each_cpu(i, sched_group_cpus(group)) { - unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); - unsigned long wl; - - if (!cpumask_test_cpu(i, cpus)) - continue; - - rq = cpu_rq(i); - wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; - wl /= power; - - if (capacity && rq->nr_running == 1 && wl > imbalance) - continue; - - if (wl > max_load) { - max_load = wl; - busiest = rq; - } - } - - return busiest; -} - -/* - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but - * so long as it is large enough. - */ -#define MAX_PINNED_INTERVAL 512 - -/* Working cpumask for load_balance and load_balance_newidle. */ -static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. + * These values are estimates at best, so no need for locking. */ -static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; - struct sched_group *group; - unsigned long imbalance; - struct rq *busiest; - unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - - cpumask_copy(cpus, cpu_active_mask); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[idle]); - -redo: - update_shares(sd); - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - cpus, balance); - - if (*balance == 0) - goto out_balanced; - - if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, idle, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[idle], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. ld_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - local_irq_save(flags); - double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &all_pinned); - double_rq_unlock(this_rq, busiest); - local_irq_restore(flags); - - /* - * some other cpu did the load balance for us. 
- */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); - - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) - goto redo; - goto out_balanced; - } - } - - if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; - - if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - - raw_spin_lock_irqsave(&busiest->lock, flags); - - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu - */ - if (!cpumask_test_cpu(this_cpu, - &busiest->curr->cpus_allowed)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); - all_pinned = 1; - goto out_one_pinned; - } - - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - raw_spin_unlock_irqrestore(&busiest->lock, flags); - if (active_balance) - wake_up_process(busiest->migration_thread); - - /* - * We've kicked active balancing, reset the failure - * counter. - */ - sd->nr_balance_failed = sd->cache_nice_tries+1; - } - } else - sd->nr_balance_failed = 0; - - if (likely(!active_balance)) { - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * move_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - } - - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - - goto out; - -out_balanced: - schedstat_inc(sd, lb_balanced[idle]); - - sd->nr_balance_failed = 0; - -out_one_pinned: - /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) - sd->balance_interval *= 2; - - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; -out: - if (ld_moved) - update_shares(sd); - return ld_moved; + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; } -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - * - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). - * this_rq is locked. - */ -static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) { - struct sched_group *group; - struct rq *busiest = NULL; - unsigned long imbalance; - int ld_moved = 0; - int sd_idle = 0; - int all_pinned = 0; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - - cpumask_copy(cpus, cpu_active_mask); - - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as CPU_NOT_IDLE. 
- */ - if (sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - - schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); -redo: - update_shares_locked(this_rq, sd); - group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, cpus, NULL); - if (!group) { - schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); - if (!busiest) { - schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); - goto out_balanced; - } - - BUG_ON(busiest == this_rq); - - schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - /* this_rq->clock is already updated */ - update_rq_clock(busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, CPU_NEWLY_IDLE, - &all_pinned); - double_unlock_balance(this_rq, busiest); - - if (unlikely(all_pinned)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) - goto redo; - } - } - - if (!ld_moved) { - int active_balance = 0; - - schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) - return -1; - - if (sd->nr_balance_failed++ < 2) - return -1; - - /* - * The only task running in a non-idle cpu can be moved to this - * cpu in an attempt to completely freeup the other CPU - * package. The same method used to move task in load_balance() - * have been extended for load_balance_newidle() to speedup - * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) - * - * The package power saving logic comes from - * find_busiest_group(). If there are no imbalance, then - * f_b_g() will return NULL. However when sched_mc={1,2} then - * f_b_g() will select a group from which a running task may be - * pulled to this cpu in order to make the other package idle. - * If there is no opportunity to make a package idle and if - * there are no imbalance, then f_b_g() will return NULL and no - * action will be taken in load_balance_newidle(). - * - * Under normal task pull operation due to imbalance, there - * will be more than one task in the source run queue and - * move_tasks() will succeed. ld_moved will be true and this - * active balance code will not be triggered. 
- */ - - /* Lock busiest in correct order while this_rq is held */ - double_lock_balance(this_rq, busiest); - - /* - * don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu - */ - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { - double_unlock_balance(this_rq, busiest); - all_pinned = 1; - return ld_moved; - } - - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - - double_unlock_balance(this_rq, busiest); - /* - * Should not call ttwu while holding a rq->lock - */ - raw_spin_unlock(&this_rq->lock); - if (active_balance) - wake_up_process(busiest->migration_thread); - raw_spin_lock(&this_rq->lock); - - } else - sd->nr_balance_failed = 0; - - update_shares_locked(this_rq, sd); - return ld_moved; - -out_balanced: - schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - sd->nr_balance_failed = 0; - - return 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; } /* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. */ -static void idle_balance(int this_cpu, struct rq *this_rq) +void calc_global_load(void) { - struct sched_domain *sd; - int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; - - this_rq->idle_stamp = this_rq->clock; + unsigned long upd = calc_load_update + 10; + long active; - if (this_rq->avg_idle < sysctl_sched_migration_cost) + if (time_before(jiffies, upd)) return; - for_each_domain(this_cpu, sd) { - unsigned long interval; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - if (sd->flags & SD_BALANCE_NEWIDLE) - /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, this_rq, - sd); + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; - break; - } - } - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ - this_rq->next_balance = next_balance; - } + calc_load_update += LOAD_FREQ; } /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * Called from update_cpu_load() to periodically update this CPU's + * active count. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static void calc_load_account_active(struct rq *this_rq) { - int target_cpu = busiest_rq->push_cpu; - struct sched_domain *sd; - struct rq *target_rq; + long delta; - /* Is there any task to move? 
*/ - if (busiest_rq->nr_running <= 1) + if (time_before(jiffies, this_rq->calc_load_update)) return; - target_rq = cpu_rq(target_cpu); - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); - - /* Search for an sd spanning us and the target CPU. */ - for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - schedstat_inc(sd, alb_count); - - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - double_unlock_balance(busiest_rq, target_rq); -} - -#ifdef CONFIG_NO_HZ -static struct { - atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + delta = calc_load_fold_active(this_rq); + delta += calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); -int get_nohz_load_balancer(void) -{ - return atomic_read(&nohz.load_balancer); + this_rq->calc_load_update += LOAD_FREQ; } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd && (sd->flags & flag)) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ - for (sd = lowest_flag_domain(cpu, flag); \ - (sd && (sd->flags & flag)); sd = sd->parent) - -/** - * is_semi_idle_group - Checks if the given sched_group is semi-idle. - * @ilb_group: group to be checked for semi-idleness + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load * - * Returns: 1 if the group is semi-idle. 0 otherwise. + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. 
+ * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). * - * We define a sched_group to be semi idle if it has atleast one idle-CPU - * and atleast one non-idle CPU. This helper function checks if the given - * sched_group is semi-idle or not. + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. */ -static inline int is_semi_idle_group(struct sched_group *ilb_group) +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, - sched_group_cpus(ilb_group)); + int j = 0; - /* - * A sched_group is semi-idle when it has atleast one busy cpu - * and atleast one idle cpu. - */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) - return 0; + if (!missed_updates) + return load; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (missed_updates >= degrade_zero_ticks[idx]) return 0; - return 1; -} -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu: The cpu which is nominating a new idle_load_balancer. - * - * Returns: Returns the id of the idle load balancer if it exists, - * Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) -{ - struct sched_domain *sd; - struct sched_group *ilb_group; + if (idx == 1) + return load >> missed_updates; - /* - * Have idle load balancer selection from semi-idle packages only - * when power-aware load balancing is enabled - */ - if (!(sched_smt_power_savings || sched_mc_power_savings)) - goto out_done; - - /* - * Optimize for the case when we have no idle CPUs or only one - * idle CPU. 
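The degrade_factor table and decay_load_missed() added above can be exercised on their own; a minimal standalone sketch (table values as in the patch, with the short rows explicitly zero-padded):

#include <stdio.h>

#define DEGRADE_SHIFT 7

static const unsigned char degrade_zero_ticks[5] = { 0, 8, 32, 64, 128 };
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
        { 0, 0, 0, 0, 0, 0, 0, 0 },
        { 64, 32, 8, 0, 0, 0, 0, 0 },
        { 96, 72, 40, 12, 1, 0, 0, 0 },
        { 112, 98, 75, 43, 15, 1, 0, 0 },
        { 120, 112, 98, 76, 45, 16, 2, 0 },
};

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
        int j = 0;

        if (!missed)
                return load;
        if (missed >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed;

        /* One multiply/shift per set bit of 'missed' instead of one per tick. */
        while (missed) {
                if (missed & 1)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* Matches the comment's example: idx 2, 8 missed ticks -> 12/128. */
        printf("%lu\n", decay(1024, 8, 2));     /* prints 96 */
        return 0;
}

Eight missed ticks at idx 2 therefore cost a single multiply-and-shift (12/128, an approximation of (3/4)^8) rather than eight successive 3/4 scalings.
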
Don't walk the sched_domain hierarchy in such cases - */ - if (cpumask_weight(nohz.cpu_mask) < 2) - goto out_done; + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilb_group = sd->groups; - - do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); - - ilb_group = ilb_group->next; - - } while (ilb_group != sd->groups); + missed_updates >>= 1; + j++; } - -out_done: - return cpumask_first(nohz.cpu_mask); -} -#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ - return cpumask_first(nohz.cpu_mask); + return load; } -#endif /* - * This routine will try to nominate the ilb (idle load balancing) - * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. - * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. - * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() - */ -int select_nohz_load_balancer(int stop_tick) -{ - int cpu = smp_processor_id(); - - if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - - if (!cpu_active(cpu)) { - if (atomic_read(&nohz.load_balancer) != cpu) - return 0; - - /* - * If we are going offline and still the leader, - * give up! - */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) - BUG(); - - return 0; - } - - cpumask_set_cpu(cpu, nohz.cpu_mask); - - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } - - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { - int new_ilb; - - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; - /* - * Check to see if there is a more power-efficient - * ilb. - */ - new_ilb = find_new_ilb(cpu); - if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); - resched_cpu(new_ilb); - return 0; - } - return 1; - } - } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; - - cpumask_clear_cpu(cpu, nohz.cpu_mask); - - if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) - BUG(); - } - return 0; -} -#endif - -static DEFINE_SPINLOCK(balancing); - -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. 
*/ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void update_cpu_load(struct rq *this_rq) { - int balance = 1; - struct rq *rq = cpu_rq(cpu); - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize; - - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; + unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; + int i, scale; - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; + this_rq->nr_load_updates++; - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; - need_serialize = sd->flags & SD_SERIALIZE; + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { - /* - * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. - */ - idle = CPU_NOT_IDLE; - } - sd->last_balance = jiffies; - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } + /* scale is effectively 1 << i now, and >> i divides by scale */ + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. */ - if (!balance) - break; + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } - /* - * next_balance will be updated only when there is a need. - * When the cpu is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - rq->next_balance = next_balance; + sched_avg_update(this_rq); } -/* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. - */ -static void run_rebalance_domains(struct softirq_action *h) +static void update_cpu_load_active(struct rq *this_rq) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); - enum cpu_idle_type idle = this_rq->idle_at_tick ? - CPU_IDLE : CPU_NOT_IDLE; - - rebalance_domains(this_cpu, idle); - -#ifdef CONFIG_NO_HZ - /* - * If this cpu is the owner for idle load balancing, then do the - * balancing on behalf of the other idle cpus whose ticks are - * stopped. 
- */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; + update_cpu_load(this_rq); - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + calc_load_account_active(this_rq); } -static inline int on_null_domain(int cpu) -{ - return !rcu_dereference(cpu_rq(cpu)->sd); -} +#ifdef CONFIG_SMP /* - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ -static inline void trigger_load_balance(struct rq *rq, int cpu) +void sched_exec(void) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); + struct task_struct *p = current; + unsigned long flags; + struct rq *rq; + int dest_cpu; - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } + rq = task_rq_lock(p, &flags); + dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? 
+ * select_task_rq() can race against ->cpus_allowed */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && + likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) + task_rq_unlock(rq, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); return; -#endif - /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) - raise_softirq(SCHED_SOFTIRQ); -} - -#else /* CONFIG_SMP */ - -/* - * on UP we do not need to balance between CPUs: - */ -static inline void idle_balance(int cpu, struct rq *rq) -{ + } +unlock: + task_rq_unlock(rq, &flags); } #endif @@ -5227,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * utime); + temp *= utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -5260,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * cputime.utime); + temp *= cputime.utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -5294,11 +3580,11 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr, cpu); + perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5412,23 +3698,9 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; - - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - - /* - * In order to avoid avg_overlap growing stale when we are - * indeed overlapping and hence not getting put to sleep, grow - * the avg_overlap on preemption. - * - * We use the average preemption runtime because that - * correlates to the amount of cache footprint a task can - * build up. 
- */ - update_avg(&prev->se.avg_overlap, runtime); - } + if (prev->se.on_rq) + update_rq_clock(rq); + rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -5478,9 +3750,8 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_sched_qs(cpu); + rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -5491,14 +3762,28 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else - deactivate_task(rq, prev, 1); + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } + deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -5512,7 +3797,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next); rq->nr_switches++; rq->curr = next; @@ -5520,8 +3805,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -5530,11 +3817,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -5562,7 +3846,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -5572,14 +3856,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -5587,8 +3871,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) /* * Owner changed, break to re-assess state. */ - if (lock->owner != owner) + if (lock->owner != owner) { + /* + * If the lock has switched to a different owner, + * we likely have heavy contention. Return 0 to quit + * optimistic spinning and not contend further: + */ + if (lock->owner) + return 0; break; + } /* * Is that owner really running on that cpu? 
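The reworked exit paths in mutex_spin_on_owner() above all reduce to one answer for the caller: keep spinning (1) or go to sleep (0). A simplified stand-in for the decision, not the real mutex slowpath (the toy names and owner pointers are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct toy_lock { void *owner; };

/* 1: still worth spinning; 0: block instead. */
static int toy_spin_on_owner(struct toy_lock *lock, void *owner, bool owner_on_cpu)
{
        if (lock->owner == owner)
                return owner_on_cpu;    /* same owner: spin only while it runs */

        /*
         * Owner changed while we spun.  If another task now holds the lock,
         * contention is heavy and the patch returns 0 so the spinner blocks;
         * if the lock was simply released, return 1 and retry the acquire.
         */
        return lock->owner == NULL;
}

int main(void)
{
        struct toy_lock l = { .owner = (void *)0x1 };

        printf("%d\n", toy_spin_on_owner(&l, (void *)0x1, true));  /* 1: keep spinning */

        l.owner = (void *)0x2;
        printf("%d\n", toy_spin_on_owner(&l, (void *)0x1, true));  /* 0: lock hopped to a third task */

        l.owner = NULL;
        printf("%d\n", toy_spin_on_owner(&l, (void *)0x1, true));  /* 1: released, go acquire */
        return 0;
}
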
@@ -5598,7 +3890,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif @@ -5609,7 +3901,7 @@ out: * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -5621,9 +3913,9 @@ asmlinkage void __sched preempt_schedule(void) return; do { - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -5722,6 +4014,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } +EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { @@ -5821,8 +4114,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) if (!x->done) { DECLARE_WAITQUEUE(wait, current); - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); + __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; @@ -5933,6 +4225,23 @@ int __sched wait_for_completion_killable(struct completion *x) EXPORT_SYMBOL(wait_for_completion_killable); /** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + */ +unsigned long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * @@ -6043,14 +4352,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq, running; struct rq *rq; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); oldprio = p->prio; + prev_class = p->sched_class; on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -6068,7 +4377,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -6090,7 +4399,6 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. 
*/ rq = task_rq_lock(p, &flags); - update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -6135,7 +4443,7 @@ int can_nice(const struct task_struct *p, const int nice) /* convert nice value [19,-20] to rlimit style value [1,40] */ int nice_rlim = 20 - nice; - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); } @@ -6270,7 +4578,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; @@ -6308,12 +4616,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -6341,16 +4645,6 @@ recheck: } if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -6366,6 +4660,22 @@ recheck: * runqueue lock must be held. */ rq = __task_rq_lock(p); + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + } +#endif + /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; @@ -6373,7 +4683,6 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -6384,6 +4693,7 @@ recheck: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); if (running) @@ -6683,7 +4993,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -6691,10 +5003,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); @@ -7105,17 +5419,15 @@ static inline void sched_init_granularity(void) /* * This is how migration works: * - * 1) we queue a struct migration_req structure in the source CPU's - * runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. 
- * 3) migration thread wakes up (implicitly it forces the migrated - * thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - * task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes * it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. */ /* @@ -7129,24 +5441,20 @@ static inline void sched_init_granularity(void) */ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - struct migration_req req; unsigned long flags; struct rq *rq; + unsigned int dest_cpu; int ret = 0; /* - * Since we rely on wake-ups to migrate sleeping tasks, don't change - * the ->cpus_allowed mask from under waking tasks, which would be - * possible when we change rq->lock in ttwu(), so synchronize against - * TASK_WAKING to avoid that. + * Serialize against TASK_WAKING so that ttwu() and wunt() can + * drop the rq->lock and still rely on ->cpus_allowed. */ again: - while (p->state == TASK_WAKING) + while (task_is_waking(p)) cpu_relax(); - rq = task_rq_lock(p, &flags); - - if (p->state == TASK_WAKING) { + if (task_is_waking(p)) { task_rq_unlock(rq, &flags); goto again; } @@ -7173,15 +5481,12 @@ again: if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); - put_task_struct(mt); - wait_for_completion(&req.done); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } @@ -7239,98 +5544,49 @@ fail: return ret; } -#define RCU_MIGRATION_IDLE 0 -#define RCU_MIGRATION_NEED_QS 1 -#define RCU_MIGRATION_GOT_QS 2 -#define RCU_MIGRATION_MUST_SYNC 3 - /* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. 
*/ -static int migration_thread(void *data) -{ - int badcpu; - int cpu = (long)data; - struct rq *rq; - - rq = cpu_rq(cpu); - BUG_ON(rq->migration_thread != current); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - struct migration_req *req; - struct list_head *head; - - raw_spin_lock_irq(&rq->lock); - - if (cpu_is_offline(cpu)) { - raw_spin_unlock_irq(&rq->lock); - break; - } - - if (rq->active_balance) { - active_load_balance(rq, cpu); - rq->active_balance = 0; - } - - head = &rq->migration_queue; - - if (list_empty(head)) { - raw_spin_unlock_irq(&rq->lock); - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - continue; - } - req = list_entry(head->next, struct migration_req, list); - list_del_init(head->next); - - if (req->task != NULL) { - raw_spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - } else if (likely(cpu == (badcpu = smp_processor_id()))) { - req->dest_cpu = RCU_MIGRATION_GOT_QS; - raw_spin_unlock(&rq->lock); - } else { - req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - raw_spin_unlock(&rq->lock); - WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); - } - local_irq_enable(); - - complete(&req->done); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) +static int migration_cpu_stop(void *data) { - int ret; + struct migration_arg *arg = data; + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); - return ret; + return 0; } +#ifdef CONFIG_HOTPLUG_CPU /* * Figure out where task on dead CPU should go, use force if necessary. */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - int dest_cpu; + struct rq *rq = cpu_rq(dead_cpu); + int needs_cpu, uninitialized_var(dest_cpu); + unsigned long flags; -again: - dest_cpu = select_fallback_rq(dead_cpu, p); + local_irq_save(flags); - /* It can have affinity changed while we were choosing. */ - if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) - goto again; + raw_spin_lock(&rq->lock); + needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); + if (needs_cpu) + dest_cpu = select_fallback_rq(dead_cpu, p); + raw_spin_unlock(&rq->lock); + /* + * It can only fail if we race with set_cpus_allowed(), + * in the racer should migrate the task anyway. 
+ */ + if (needs_cpu) + __migrate_task(p, dead_cpu, dest_cpu); + local_irq_restore(flags); } /* @@ -7394,7 +5650,6 @@ void sched_idle_next(void) __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - update_rq_clock(rq); activate_task(rq, p, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -7449,7 +5704,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - update_rq_clock(rq); next = pick_next_task(rq); if (!next) break; @@ -7672,35 +5926,20 @@ static void set_rq_offline(struct rq *rq) static int __cpuinit migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct task_struct *p; int cpu = (long)hcpu; unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - kthread_bind(p, cpu); - /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); - get_task_struct(p); - cpu_rq(cpu)->migration_thread = p; rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - /* Strictly unnecessary, as first user will wake it. */ - wake_up_process(cpu_rq(cpu)->migration_thread); - /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7711,61 +5950,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!cpu_rq(cpu)->migration_thread) - break; - /* Unbind it from offline cpu so it can run. Fall thru. */ - kthread_bind(cpu_rq(cpu)->migration_thread, - cpumask_any(cpu_online_mask)); - kthread_stop(cpu_rq(cpu)->migration_thread); - put_task_struct(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; - break; - case CPU_DEAD: case CPU_DEAD_FROZEN: - cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - put_task_struct(rq->migration_thread); - rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); raw_spin_unlock_irq(&rq->lock); - cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); calc_global_load_remove(rq); - /* - * No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. 
- */ - raw_spin_lock_irq(&rq->lock); - while (!list_empty(&rq->migration_queue)) { - struct migration_req *req; - - req = list_entry(rq->migration_queue.next, - struct migration_req, list); - list_del_init(&req->list); - raw_spin_unlock_irq(&rq->lock); - complete(&req->done); - raw_spin_lock_irq(&rq->lock); - } - raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7785,20 +5987,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -8033,23 +6264,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if (cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -8065,7 +6291,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -8078,7 +6304,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -8096,6 +6322,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + for (tmp = sd; tmp; tmp = tmp->parent) + tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); + /* Remove the sched domains which do not contribute to scheduling. 
*/ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -9202,11 +7431,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -9218,11 +7449,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, #ifdef CONFIG_SCHED_SMT static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); @@ -9250,29 +7483,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -9318,10 +7557,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -9437,7 +7674,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; - rt_rq->rt_se = rt_se; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); @@ -9468,9 +7704,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_USER_SCHED - alloc_size *= 2; -#endif #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif @@ -9484,13 +7717,6 @@ void __init sched_init(void) init_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr 
+= nr_cpu_ids * sizeof(void **); -#ifdef CONFIG_USER_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -9499,13 +7725,6 @@ void __init sched_init(void) init_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#ifdef CONFIG_USER_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { @@ -9525,22 +7744,13 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&init_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); -#ifdef CONFIG_USER_SCHED - init_rt_bandwidth(&root_task_group.rt_bandwidth, - global_rt_period(), RUNTIME_INF); -#endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); -#ifdef CONFIG_USER_SCHED - INIT_LIST_HEAD(&root_task_group.children); - init_task_group.parent = &root_task_group; - list_add(&init_task_group.siblings, &root_task_group.children); -#endif /* CONFIG_USER_SCHED */ -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), @@ -9580,25 +7790,6 @@ void __init sched_init(void) * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - root_task_group.shares = NICE_0_LOAD; - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); - /* - * In case of task-groups formed thr' the user id of tasks, - * init_task_group represents tasks belonging to root user. - * Hence it forms a sibling of all subsequent groups formed. - * In this case, init_task_group gets only a fraction of overall - * system cpu resource, based on the weight assigned to root - * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished - * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_tg_cfs_rq) and having one entity represent this group of - * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 
- */ - init_tg_cfs_entry(&init_task_group, - &per_cpu(init_tg_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, - root_task_group.se[i]); - #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -9607,31 +7798,31 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#elif defined CONFIG_USER_SCHED - init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); - init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq_var, i), - &per_cpu(init_sched_rt_entity, i), i, 1, - root_task_group.rt_se[i]); #endif #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->online = 0; - rq->migration_thread = NULL; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -9676,8 +7867,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) @@ -9697,7 +7891,7 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -void __might_sleep(char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ @@ -9731,7 +7925,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; - update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); @@ -9758,9 +7951,9 @@ void normalize_rt_tasks(void) p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; + p->se.statistics.wait_start = 0; + p->se.statistics.sleep_start = 0; + p->se.statistics.block_start = 0; #endif if (!rt_task(p)) { @@ -9787,9 +7980,9 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#ifdef CONFIG_IA64 +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for the IA64 MCA handling. + * These functions are only useful for the IA64 MCA handling, or kdb. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -9809,6 +8002,9 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 /** * set_curr_task - set the current task for a given cpu. * @cpu: the processor in question. 
@@ -10008,7 +8204,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_GROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -10093,8 +8289,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - update_rq_clock(rq); - running = task_current(rq, tsk); on_rq = tsk->se.on_rq; @@ -10117,7 +8311,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } -#endif /* CONFIG_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares) @@ -10259,13 +8453,6 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } -#ifdef CONFIG_USER_SCHED - if (tg == &root_task_group) { - period = global_rt_period(); - runtime = global_rt_runtime(); - } -#endif - /* * Cannot have more runtime than the period. */ @@ -10668,7 +8855,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 *cpuusage; + u64 __percpu *cpuusage; struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; @@ -10885,12 +9072,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) } /* + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large + * in cputime_t units. As a result, cpuacct_update_stats calls + * percpu_counter_add with values large enough to always overflow the + * per cpu batch limit causing bad SMP scalability. + * + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled + * and enabled. We cap it at INT_MAX which is the largest allowed batch value. + */ +#ifdef CONFIG_SMP +#define CPUACCT_BATCH \ + min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) +#else +#define CPUACCT_BATCH 0 +#endif + +/* * Charge the system/user time to the task's accounting group. 
*/ static void cpuacct_update_stats(struct task_struct *tsk, enum cpuacct_stat_index idx, cputime_t val) { struct cpuacct *ca; + int batch = CPUACCT_BATCH; if (unlikely(!cpuacct_subsys.active)) return; @@ -10899,7 +9104,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, ca = task_ca(tsk); do { - percpu_counter_add(&ca->cpustat[idx], val); + __percpu_counter_add(&ca->cpustat[idx], val, batch); ca = ca->parent; } while (ca); rcu_read_unlock(); @@ -10916,43 +9121,32 @@ struct cgroup_subsys cpuacct_subsys = { #ifndef CONFIG_SMP -int rcu_expedited_torture_stats(char *page) -{ - return 0; -} -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - void synchronize_sched_expedited(void) { + barrier(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #else /* #ifndef CONFIG_SMP */ -static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); -static DEFINE_MUTEX(rcu_sched_expedited_mutex); - -#define RCU_EXPEDITED_STATE_POST -2 -#define RCU_EXPEDITED_STATE_IDLE -1 - -static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; +static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); -int rcu_expedited_torture_stats(char *page) +static int synchronize_sched_expedited_cpu_stop(void *data) { - int cnt = 0; - int cpu; - - cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); - for_each_online_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d:%d", - cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); - } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; } -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - -static long synchronize_sched_expedited_count; /* * Wait for an rcu-sched grace period to elapse, but use "big hammer" @@ -10966,18 +9160,14 @@ static long synchronize_sched_expedited_count; */ void synchronize_sched_expedited(void) { - int cpu; - unsigned long flags; - bool need_full_sync = 0; - struct rq *rq; - struct migration_req *req; - long snap; - int trycount = 0; + int snap, trycount = 0; smp_mb(); /* ensure prior mod happens before capturing snap. 
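The CPUACCT_BATCH change above scales the per-cpu counter batch by cputime_one_jiffy and caps it at INT_MAX, so percpu_counter batching behaves the same whether or not CONFIG_VIRT_CPU_ACCOUNTING inflates cputime values. A minimal standalone C sketch of that computation; the percpu_counter_batch and cputime_one_jiffy values below are illustrative placeholders, not kernel defaults:

#include <limits.h>
#include <stdio.h>

int main(void)
{
    /* Placeholder inputs; in the kernel these come from
     * percpu_counter_batch and cputime_one_jiffy. */
    long percpu_counter_batch = 32;
    long cputime_one_jiffy = 10000000;

    /* min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) */
    long product = percpu_counter_batch * cputime_one_jiffy;
    long batch = product < INT_MAX ? product : INT_MAX;

    printf("batch = %ld\n", batch); /* 320000000 here, capped at INT_MAX otherwise */
    return 0;
}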
*/ - snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + snap = atomic_read(&synchronize_sched_expedited_count) + 1; get_online_cpus(); - while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { put_online_cpus(); if (trycount++ < 10) udelay(trycount * num_online_cpus()); @@ -10985,41 +9175,15 @@ void synchronize_sched_expedited(void) synchronize_sched(); return; } - if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { smp_mb(); /* ensure test happens before caller kfree */ return; } get_online_cpus(); } - rcu_expedited_state = RCU_EXPEDITED_STATE_POST; - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - req = &per_cpu(rcu_migration_req, cpu); - init_completion(&req->done); - req->task = NULL; - req->dest_cpu = RCU_MIGRATION_NEED_QS; - raw_spin_lock_irqsave(&rq->lock, flags); - list_add(&req->list, &rq->migration_queue); - raw_spin_unlock_irqrestore(&rq->lock, flags); - wake_up_process(rq->migration_thread); - } - for_each_online_cpu(cpu) { - rcu_expedited_state = cpu; - req = &per_cpu(rcu_migration_req, cpu); - rq = cpu_rq(cpu); - wait_for_completion(&req->done); - raw_spin_lock_irqsave(&rq->lock, flags); - if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) - need_full_sync = 1; - req->dest_cpu = RCU_MIGRATION_IDLE; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - synchronize_sched_expedited_count++; - mutex_unlock(&rcu_sched_expedited_mutex); + atomic_inc(&synchronize_sched_expedited_count); + smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ put_online_cpus(); - if (need_full_sync) - synchronize_sched(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5b496132c28..52f1a149bfb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar <mingo@redhat.com> * Guillaume Chazarain <guichaz@gmail.com> * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. 
The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -41,6 +77,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); } +EXPORT_SYMBOL_GPL(sched_clock); static __read_mostly int sched_clock_running; @@ -169,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -236,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -248,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -263,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099df..2722dc1b413 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -27,6 +27,7 @@ * of the License. 
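The rewritten header comment above documents cpu_clock()/local_clock(); when an architecture supplies no sched_clock(), the weak fallback shown in this hunk derives nanoseconds from jiffies. A standalone sketch of that arithmetic, with HZ, INITIAL_JIFFIES and the tick count chosen as example values:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ 250ULL               /* assumed CONFIG_HZ for this example */
#define INITIAL_JIFFIES 0ULL    /* simplified; the kernel biases this value */

/* nanoseconds = (jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ) */
static unsigned long long fallback_sched_clock(unsigned long long jiffies)
{
    return (jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
}

int main(void)
{
    /* 1000 ticks at HZ=250 correspond to 4 seconds worth of nanoseconds */
    printf("%llu ns\n", fallback_sched_clock(1000));
    return 0;
}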
*/ +#include <linux/gfp.h> #include "sched_cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -47,9 +48,7 @@ static int convert_prio(int prio) } #define for_each_cpupri_active(array, idx) \ - for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ - idx < CPUPRI_NR_PRIORITIES; \ - idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) /** * cpupri_find - find the best (lowest-pri) CPU in the system @@ -58,7 +57,7 @@ static int convert_prio(int prio) * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in + * current invocation. By the time the call returns, the CPUs may have in * fact changed priorities any number of times. While not ideal, it is not * an issue of correctness since the normal rebalancer logic will correct * any discrepancies created by racing against the uncertainty of the current @@ -167,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp) { - gfp_t gfp = GFP_KERNEL; int i; - if (bootmem) - gfp = GFP_NOWAIT; - memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -182,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) raw_spin_lock_init(&vec->lock); vec->count = 0; - if (!zalloc_cpumask_var(&vec->mask, gfp)) + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95b..9fc7d386fea 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -27,7 +27,7 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aada4b..2e1b0d17dd9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->wait_start); - PN(se->sleep_start); - PN(se->block_start); - PN(se->sleep_max); - PN(se->block_max); - PN(se->exec_max); - PN(se->slice_max); - PN(se->wait_max); - PN(se->wait_sum); - P(se->wait_count); + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); #endif P(se->load.weight); #undef PN @@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.sum_sleep_runtime)); + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); #else SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { char path[64]; + rcu_read_lock(); 
cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + rcu_read_unlock(); SEQ_printf(m, " %s", path); } #endif @@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - { - uid_t uid = cfs_rq->tg->uid; - SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); - } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif @@ -335,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); + P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P @@ -384,15 +381,9 @@ __initcall(init_sched_debug_procfs); void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - unsigned long flags; - int num_threads = 1; - - if (lock_task_sighand(p, &flags)) { - num_threads = atomic_read(&p->signal->count); - unlock_task_sighand(p, &flags); - } - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------\n"); #define __P(F) \ @@ -407,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); - PN(se.avg_overlap); - PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.wait_start); - PN(se.sleep_start); - PN(se.block_start); - PN(se.sleep_max); - PN(se.block_max); - PN(se.exec_max); - PN(se.slice_max); - PN(se.wait_max); - PN(se.wait_sum); - P(se.wait_count); - PN(se.iowait_sum); - P(se.iowait_count); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); P(sched_info.bkl_count); P(se.nr_migrations); - P(se.nr_migrations_cold); - P(se.nr_failed_migrations_affine); - P(se.nr_failed_migrations_running); - P(se.nr_failed_migrations_hot); - P(se.nr_forced_migrations); - P(se.nr_wakeups); - P(se.nr_wakeups_sync); - P(se.nr_wakeups_migrate); - P(se.nr_wakeups_local); - P(se.nr_wakeups_remote); - P(se.nr_wakeups_affine); - P(se.nr_wakeups_affine_attempts); - P(se.nr_wakeups_passive); - P(se.nr_wakeups_idle); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); { u64 avg_atom, avg_per_cpu; @@ -491,35 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.wait_max = 0; - p->se.wait_sum = 0; - p->se.wait_count = 0; - 
p->se.iowait_sum = 0; - p->se.iowait_count = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.nr_migrations = 0; - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - p->sched_info.bkl_count = 0; + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->nvcsw = 0; - p->nivcsw = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8fe7ee81c55..db3f674ca49 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,8 +35,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 5000000ULL; -unsigned int normalized_sysctl_sched_latency = 5000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 5; +static unsigned int sched_nr_latency = 8; /* * After fork, child runs first. 
If set to 0 (default) then @@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, { unsigned long delta_exec_weighted; - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); @@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); } /* @@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - schedstat_set(se->wait_count, se->wait_count + 1); - schedstat_set(se->wait_sum, se->wait_sum + - rq_of(cfs_rq)->clock - se->wait_start); + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->wait_start); + rq_of(cfs_rq)->clock - se->statistics.wait_start); } #endif - schedstat_set(se->wait_start, 0); + schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (entity_is_task(se)) tsk = task_of(se); - if (se->sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; - se->sleep_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->block_start; + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->block_max)) - se->block_max = delta; + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; - se->block_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { if (tsk->in_iowait) { - se->iowait_sum += delta; - se->iowait_count++; + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; trace_sched_stat_iowait(tsk, delta); } @@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial && sched_feat(FAIR_SLEEPERS)) { + if (!initial) { unsigned long thresh = sysctl_sched_latency; /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. 
We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); - - /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */ @@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_MIGRATE 2 - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* @@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. @@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); - if (sleep) { + if (flags & DEQUEUE_SLEEP) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_of(cfs_rq)->clock; } #endif } @@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ - if (!sleep) + if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; } @@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * when there are only lesser-weight tasks around): */ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->slice_max = max(se->slice_max, + se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } #endif @@ -1053,16 +1041,11 @@ static inline void hrtick_update(struct rq *rq) * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int flags = 0; - - if (wakeup) - flags |= ENQUEUE_WAKEUP; - if (p->state == TASK_WAKING) - flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) @@ -1080,18 +1063,18 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) * decreased. 
We remove the task from the rbtree and * update the fair scheduling stats: */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep); + dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; - sleep = 1; + flags |= DEQUEUE_SLEEP; } hrtick_update(rq); @@ -1239,11 +1222,9 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = current; unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1254,23 +1235,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (sync) { - if (sched_feat(SYNC_LESS) && - (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - } else { - if (sched_feat(SYNC_MORE) && - (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; - } - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ + rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1282,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1293,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; + rcu_read_unlock(); /* * If the currently running task will sleep within @@ -1305,7 +1286,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) if (sync && balanced) return 1; - schedstat_inc(p, se.nr_wakeups_affine_attempts); + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); if (balanced || @@ -1317,7 +1298,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * there is no bad imbalance. 
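The new wake_affine() balance test above weighs each side's prospective load by the other CPU's power and inflates the previous CPU's side by half of sd->imbalance_pct. A sketch of that comparison with made-up numbers; the load figures stand in for the effective_load()-adjusted loads, and the power_of() results and imbalance_pct are assumed values:

#include <stdio.h>

int main(void)
{
    unsigned long this_load = 1024, prev_load = 2048;           /* adjusted loads (assumed) */
    unsigned long this_cpu_power = 1024, prev_cpu_power = 1024; /* power_of() (assumed) */
    unsigned int imbalance_pct = 125;                           /* assumed sd->imbalance_pct */

    unsigned long this_eff_load = 100UL * prev_cpu_power * this_load;
    unsigned long prev_eff_load = (100 + (imbalance_pct - 100) / 2)
                                    * this_cpu_power * prev_load;

    /* the affine wakeup is considered balanced when the wakeup-target side
     * does not end up more loaded than the (slightly inflated) prev side */
    printf("balanced = %d\n", this_eff_load <= prev_eff_load);
    return 0;
}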
*/ schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1; } @@ -1332,7 +1313,7 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -1367,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - this = group; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; @@ -1405,29 +1385,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int -select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_sibling(struct task_struct *p, int target) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); + struct sched_domain *sd; int i; /* - * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE - * test in select_task_rq_fair) and the prev_cpu is idle then that's - * always a better target than the current cpu. + * If the task is going to be woken-up on this cpu and if it is + * already idle, then it is the right target. + */ + if (target == cpu && idle_cpu(cpu)) + return cpu; + + /* + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it the right target. */ - if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + if (target == prev_cpu && idle_cpu(prev_cpu)) return prev_cpu; /* - * Otherwise, iterate the domain and find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (idle_cpu(i)) { + target = i; + break; + } } + + /* + * Lets stop looking for an idle sibling when we reached + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; } return target; @@ -1444,7 +1443,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +static int +select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1455,8 +1455,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } @@ -1490,34 +1489,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } /* - * While iterating the domains looking for a spanning - * WAKE_AFFINE domain, adjust the affine target to any idle cpu - * in cache sharing domains along the way. 
+ * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. */ - if (want_affine) { - int target = -1; - - /* - * If both cpu and prev_cpu are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. - */ - if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - target = cpu; - - /* - * If there's an idle sibling in this domain, make that - * the wake_affine target instead of the current cpu. - */ - if (tmp->flags & SD_SHARE_PKG_RESOURCES) - target = select_idle_sibling(p, tmp, target); - - if (target >= 0) { - if (tmp->flags & SD_WAKE_AFFINE) { - affine_sd = tmp; - want_affine = 0; - } - cpu = target; - } + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + want_affine = 0; } if (!want_sd && !want_affine) @@ -1530,22 +1508,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag sd = tmp; } +#ifdef CONFIG_FAIR_GROUP_SCHED if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over */ tmp = sd; - if (affine_sd && (!tmp || - cpumask_weight(sched_domain_span(affine_sd)) > - cpumask_weight(sched_domain_span(sd)))) + if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) tmp = affine_sd; - if (tmp) + if (tmp) { + raw_spin_unlock(&rq->lock); update_shares(tmp); + raw_spin_lock(&rq->lock); + } } +#endif - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return select_idle_sibling(p, cpu); + else + return select_idle_sibling(p, prev_cpu); + } while (sd) { int load_idx = sd->forkexec_idx; @@ -1575,10 +1560,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); + weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) + if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; @@ -1590,63 +1575,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } #endif /* CONFIG_SMP */ -/* - * Adaptive granularity - * - * se->avg_wakeup gives the average time a task runs until it does a wakeup, - * with the limit of wakeup_gran -- when it never does a wakeup. - * - * So the smaller avg_wakeup is the faster we want this task to preempt, - * but we don't want to treat the preemptee unfairly and therefore allow it - * to run for at least the amount of time we'd like to run. - * - * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one - * - * NOTE: we use *nr_running to scale with load, this nicely matches the - * degrading latency on load. 
- */ -static unsigned long -adaptive_gran(struct sched_entity *curr, struct sched_entity *se) -{ - u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; - u64 gran = 0; - - if (this_run < expected_wakeup) - gran = expected_wakeup - this_run; - - return min_t(s64, gran, sysctl_sched_wakeup_granularity); -} - static unsigned long wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; - if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) - gran = adaptive_gran(curr, se); - /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. */ - if (sched_feat(ASYM_GRAN)) { - /* - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - } else { - if (unlikely(curr->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, curr); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); return gran; } @@ -1704,7 +1652,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) @@ -1737,14 +1684,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; - if (sched_feat(WAKEUP_SYNC) && sync) - goto preempt; - - if (sched_feat(WAKEUP_OVERLAP) && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1815,57 +1754,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) */ /* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. Here we - * achieve that by always pre-iterating before returning - * the current task: + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. 
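wakeup_gran() above now always converts the wakeup granularity into the wakee's virtual time via calc_delta_fair(), so lighter wakees see a larger granularity and heavier wakees a smaller one. A sketch of that effect, approximating calc_delta_fair() as gran * NICE_0_LOAD / weight and using illustrative weights:

#include <stdio.h>

#define NICE_0_LOAD 1024UL

static unsigned long wakeup_vgran(unsigned long gran_ns, unsigned long se_weight)
{
    if (se_weight == NICE_0_LOAD)
        return gran_ns;
    return gran_ns * NICE_0_LOAD / se_weight; /* heavier wakee => smaller gran */
}

int main(void)
{
    unsigned long gran = 1000000UL; /* assumed 1ms wakeup granularity */

    printf("light wakee (weight 512):  %lu ns\n", wakeup_vgran(gran, 512));  /* larger */
    printf("heavy wakee (weight 2048): %lu ns\n", wakeup_vgran(gran, 2048)); /* smaller */
    return 0;
}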
*/ -static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - struct task_struct *p = NULL; - struct sched_entity *se; + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + check_preempt_curr(this_rq, p, 0); +} - if (next == &cfs_rq->tasks) - return NULL; +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + int tsk_cache_hot = 0; + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + *all_pinned = 0; - se = list_entry(next, struct sched_entity, group_node); - p = task_of(se); - cfs_rq->balance_iterator = next->next; + if (task_running(rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } - return p; -} + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ -static struct task_struct *load_balance_start_fair(void *arg) -{ - struct cfs_rq *cfs_rq = arg; + tsk_cache_hot = task_hot(p, rq->clock, sd); + if (!tsk_cache_hot || + sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } - return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); + if (tsk_cache_hot) { + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; + } + return 1; } -static struct task_struct *load_balance_next_fair(void *arg) +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) { - struct cfs_rq *cfs_rq = arg; + struct task_struct *p, *n; + struct cfs_rq *cfs_rq; + int pinned = 0; - return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); + for_each_leaf_cfs_rq(busiest, cfs_rq) { + list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + + if (!can_migrate_task(p, busiest, this_cpu, + sd, idle, &pinned)) + continue; + + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). 
+ */ + schedstat_inc(sd, lb_gained[idle]); + return 1; + } + } + + return 0; } static unsigned long -__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, - struct cfs_rq *cfs_rq) +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - struct rq_iterator cfs_rq_iterator; + int loops = 0, pulled = 0, pinned = 0; + long rem_load_move = max_load_move; + struct task_struct *p, *n; - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - cfs_rq_iterator.arg = cfs_rq; + if (max_load_move == 0) + goto out; - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, - this_best_prio, &cfs_rq_iterator); + pinned = 1; + + list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + if (loops++ > sysctl_sched_nr_migrate) + break; + + if ((p->se.load.weight >> 1) > rem_load_move || + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + continue; + + pull_task(busiest, p, this_rq, this_cpu); + pulled++; + rem_load_move -= p->se.load.weight; + +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ + if (idle == CPU_NEWLY_IDLE) + break; +#endif + + /* + * We only want to steal up to the prescribed amount of + * weighted load. + */ + if (rem_load_move <= 0) + break; + + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; + } +out: + /* + * Right now, this is one of only two places pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; + + return max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1897,9 +1943,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = (u64)rem_load_move * busiest_weight; rem_load = div_u64(rem_load, busiest_h_load + 1); - moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + moved_load = balance_tasks(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, - tg->cfs_rq[busiest_cpu]); + busiest_cfs_rq); if (!moved_load) continue; @@ -1922,36 +1968,1737 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - return __load_balance_fair(this_rq, this_cpu, busiest, + return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, idle, all_pinned, this_best_prio, &busiest->cfs); } #endif -static int -move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
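balance_tasks() above walks the busiest cfs_rq's task list under a weighted-load budget: tasks whose half-weight exceeds the remaining budget are skipped, and the walk stops once the budget is spent or sysctl_sched_nr_migrate iterations are reached. A standalone sketch of that selection loop with illustrative weights and limits:

#include <stdio.h>

int main(void)
{
    long weights[] = { 1024, 3072, 512, 2048, 1024 }; /* candidate task weights (assumed) */
    int nr_tasks = sizeof(weights) / sizeof(weights[0]);
    long rem_load = 2500;     /* max_load_move budget (assumed) */
    int nr_migrate_cap = 32;  /* stands in for sysctl_sched_nr_migrate */
    int loops = 0, pulled = 0;

    for (int i = 0; i < nr_tasks && loops++ < nr_migrate_cap; i++) {
        if ((weights[i] >> 1) > rem_load) /* too heavy for what is left */
            continue;
        rem_load -= weights[i];
        pulled++;
        printf("pull task %d (weight %ld), remaining budget %ld\n",
               i, weights[i], rem_load);
        if (rem_load <= 0)
            break;
    }
    printf("pulled %d task(s)\n", pulled);
    return 0;
}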
+ */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) { - struct cfs_rq *busy_cfs_rq; - struct rq_iterator cfs_rq_iterator; + unsigned long total_load_moved = 0, load_moved; + int this_best_prio = this_rq->curr->prio; - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; + do { + load_moved = load_balance_fair(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { + total_load_moved += load_moved; + +#ifdef CONFIG_PREEMPT /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. */ - cfs_rq_iterator.arg = busy_cfs_rq; - if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, - &cfs_rq_iterator)) - return 1; + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + + if (raw_spin_is_contended(&this_rq->lock) || + raw_spin_is_contended(&busiest->lock)) + break; +#endif + } while (load_moved && max_load_move > total_load_moved); + + return total_load_moved > 0; +} + +/********** Helpers for find_busiest_group ************************/ +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + /** Statistics of this group */ + unsigned long this_load; + unsigned long this_load_per_task; + unsigned long this_nr_running; + + /* Statistics of the busiest group */ + unsigned long max_load; + unsigned long busiest_load_per_task; + unsigned long busiest_nr_running; + unsigned long busiest_group_capacity; + + int group_imb; /* Is there imbalance in this sd */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance; /* Is powersave balance needed for this sd */ + struct sched_group *group_min; /* Least loaded group in sd */ + struct sched_group *group_leader; /* Group which relieves group_min */ + unsigned long min_load_per_task; /* load_per_task in group_min */ + unsigned long leader_nr_running; /* Nr running of group_leader */ + unsigned long min_nr_running; /* Nr running of group_min */ +#endif +}; + +/* + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { + unsigned long avg_load; /*Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long sum_nr_running; /* Nr tasks running in the group */ + unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long group_capacity; + int group_imb; /* Is there an imbalance in the group ? */ +}; + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. 
+ * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, + enum cpu_idle_type idle) +{ + int load_idx; + + switch (idle) { + case CPU_NOT_IDLE: + load_idx = sd->busy_idx; + break; + + case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; + break; + default: + load_idx = sd->idle_idx; + break; + } + + return load_idx; +} + + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; } +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + + if (!sds->power_savings_balance) + return; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (sds->this_nr_running >= sgs->group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!sds->power_savings_balance || + sgs->sum_nr_running >= sgs->group_capacity || + !sgs->sum_nr_running) + return; + + /* + * Calculate the group which has the least non-idle load. 
+ * This is the group from where we need to pick up the load + * for saving power + */ + if ((sgs->sum_nr_running < sds->min_nr_running) || + (sgs->sum_nr_running == sds->min_nr_running && + group_first_cpu(group) > group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs->sum_nr_running; + sds->min_load_per_task = sgs->sum_weighted_load / + sgs->sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sgs->sum_nr_running + 1 > sgs->group_capacity) + return; + + if (sgs->sum_nr_running > sds->leader_nr_running || + (sgs->sum_nr_running == sds->leader_nr_running && + group_first_cpu(group) < group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs->sum_nr_running; + } +} + +/** + * check_power_save_busiest_group - see if there is potential for some power-savings balance + * @sds: Variable containing the statistics of the sched_domain + * under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + if (!sds->power_savings_balance) + return 0; + + if (sds->this != sds->group_leader || + sds->group_leader == sds->group_min) + return 0; + + *imbalance = sds->min_load_per_task; + sds->busiest = sds->group_min; + + return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = sd->span_weight; + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + +unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + available = total - rq->rt_avg; + + if (unlikely((s64)total < SCHED_LOAD_SCALE)) + total = SCHED_LOAD_SCALE; + + total >>= SCHED_LOAD_SHIFT; + + return div_u64(available, total); +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = sd->span_weight; + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + + if ((sd->flags & 
SD_SHARE_CPUPOWER) && weight > 1) { + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + } + + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + + power *= scale_rt_power(cpu); + power >>= SCHED_LOAD_SHIFT; + + if (!power) + power = 1; + + cpu_rq(cpu)->cpu_power = power; + sdg->cpu_power = power; +} + +static void update_group_power(struct sched_domain *sd, int cpu) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power; + + if (!child) { + update_cpu_power(sd, cpu); + return; + } + + power = 0; + + group = child->groups; + do { + power += group->cpu_power; + group = group->next; + } while (group != child->groups); + + sdg->cpu_power = power; +} + +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 > group->cpu_power_orig * 29) + return 1; return 0; } +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @sd_idle: Idle status of the sched_domain containing group. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, + enum cpu_idle_type idle, int load_idx, int *sd_idle, + int local_group, const struct cpumask *cpus, + int *balance, struct sg_lb_stats *sgs) +{ + unsigned long load, max_cpu_load, min_cpu_load; + int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long avg_load_per_task = 0; + + if (local_group) + balance_cpu = group_first_cpu(group); + + /* Tally up the load of all CPUs in the group */ + max_cpu_load = 0; + min_cpu_load = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + if (*sd_idle && rq->nr_running) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } + + sgs->group_load += load; + sgs->sum_nr_running += rq->nr_running; + sgs->sum_weighted_load += weighted_cpuload(i); + + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. 
In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } + + /* Adjust by relative CPU power of the group */ + sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if (sgs->sum_nr_running) + avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + sgs->group_imb = 1; + + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; +} + +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @sd_idle: Idle status of the sched_domain containing sg. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. 
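The cpu_power produced by update_cpu_power() above is SCHED_LOAD_SCALE compounded with an SMT factor, a cpufreq factor and the fraction of time not eaten by RT/IRQ work. A minimal stand-alone sketch of that arithmetic; the figures (SCHED_LOAD_SCALE of 1024, smt_gain of 1178 for a two-thread core, roughly 90% of the time left to CFS) are assumed for illustration and are not taken from the patch:

/* Illustrative only: mirrors the multiply-and-shift pattern of update_cpu_power(). */
#include <stdio.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE;
        unsigned long smt_gain = 1178, weight = 2;      /* assumed: 2 hardware threads per core */
        unsigned long rt_scale = 921;                   /* assumed scale_rt_power() result, ~90% */

        power *= smt_gain / weight;                     /* default_scale_smt_power() -> 589 */
        power >>= SCHED_LOAD_SHIFT;

        power *= SCHED_LOAD_SCALE;                      /* default_scale_freq_power() leaves it unchanged */
        power >>= SCHED_LOAD_SHIFT;

        power *= rt_scale;                              /* scale down by the time left after RT/IRQ */
        power >>= SCHED_LOAD_SHIFT;

        if (!power)
                power = 1;

        printf("effective cpu_power: %lu of %lu\n", power, SCHED_LOAD_SCALE);   /* 529 of 1024 */
        return 0;
}

If RT activity eats enough of a sibling's time this value can round down to a capacity of 0, which is the case fix_small_capacity() above patches up for SD_LV_SIBLING domains.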
+ */ +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, + enum cpu_idle_type idle, int *sd_idle, + const struct cpumask *cpus, int *balance, + struct sd_lb_stats *sds) +{ + struct sched_domain *child = sd->child; + struct sched_group *sg = sd->groups; + struct sg_lb_stats sgs; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; + + init_sd_power_savings_stats(sd, sds, idle); + load_idx = get_sd_load_idx(sd, idle); + + do { + int local_group; + + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); + memset(&sgs, 0, sizeof(sgs)); + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + local_group, cpus, balance, &sgs); + + if (local_group && !(*balance)) + return; + + sds->total_load += sgs.group_load; + sds->total_pwr += sg->cpu_power; + + /* + * In case the child domain prefers tasks go to siblings + * first, lower the sg capacity to one so that we'll try + * and move all the excess tasks away. + */ + if (prefer_sibling) + sgs.group_capacity = min(sgs.group_capacity, 1UL); + + if (local_group) { + sds->this_load = sgs.avg_load; + sds->this = sg; + sds->this_nr_running = sgs.sum_nr_running; + sds->this_load_per_task = sgs.sum_weighted_load; + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { + sds->max_load = sgs.avg_load; + sds->busiest = sg; + sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_group_capacity = sgs.group_capacity; + sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->group_imb = sgs.group_imb; + } + + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. 
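update_sg_lb_stats() above condenses a group into a handful of numbers; the two least obvious are group_capacity (the group's combined cpu_power rounded to a whole number of tasks) and group_imb (set when per-cpu loads within the group differ by more than twice the average task weight). A short sketch with hypothetical loads; the cpu_power values carry over from the previous sketch:

/* Illustrative only; SCHED_LOAD_SCALE and DIV_ROUND_CLOSEST as used in the hunk above. */
#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

int main(void)
{
        unsigned long cpu_power = 2 * 529;              /* two SMT siblings of reduced power */
        unsigned long max_cpu_load = 3072, min_cpu_load = 512;
        unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
        unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;   /* 1024 */

        unsigned long group_capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);
        int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

        printf("group_capacity=%lu group_imb=%d\n", group_capacity, group_imb); /* 1 and 1 */
        return 0;
}

A capacity of 1 here is what later lets calculate_imbalance() treat everything beyond a single task as load_above_capacity.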
+ */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; +} + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + * amongst the groups of a sched_domain, during + * load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned int imbn = 2; + unsigned long scaled_busy_load_per_task; + + if (sds->this_nr_running) { + sds->this_load_per_task /= sds->this_nr_running; + if (sds->busiest_load_per_task > + sds->this_load_per_task) + imbn = 1; + } else + sds->this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + scaled_busy_load_per_task = sds->busiest_load_per_task + * SCHED_LOAD_SCALE; + scaled_busy_load_per_task /= sds->busiest->cpu_power; + + if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { + *imbalance = sds->busiest_load_per_task; + return; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += sds->busiest->cpu_power * + min(sds->busiest_load_per_task, sds->max_load); + pwr_now += sds->this->cpu_power * + min(sds->this_load_per_task, sds->this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->busiest->cpu_power; + if (sds->max_load > tmp) + pwr_move += sds->busiest->cpu_power * + min(sds->busiest_load_per_task, sds->max_load - tmp); + + /* Amount of load we'd add */ + if (sds->max_load * sds->busiest->cpu_power < + sds->busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = (sds->max_load * sds->busiest->cpu_power) / + sds->this->cpu_power; + else + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->this->cpu_power; + pwr_move += sds->this->cpu_power * + min(sds->this_load_per_task, sds->this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = sds->busiest_load_per_task; +} + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. 
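check_asym_packing() above only reports an imbalance when the balancing CPU is numbered below the first CPU of the busiest group, and the amount is simply that group's scaled load translated back through its cpu_power. A hypothetical SMT scenario; the numbers are illustrative, not taken from the patch:

/* Illustrative only: the packing decision for an idle low-numbered SMT thread. */
#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

int main(void)
{
        int this_cpu = 0;                       /* idle thread running the balance */
        int busiest_cpu = 2;                    /* first cpu of the busiest group */
        unsigned long max_load = 1024;          /* scaled load of that group */
        unsigned long busiest_power = 589;      /* its cpu_power (one SMT thread, assumed) */

        if (this_cpu > busiest_cpu) {
                printf("no packing: the work already sits on lower-numbered threads\n");
                return 0;
        }

        /* Translate the scaled group load back into weighted task load to pull. */
        printf("pull %lu worth of load to cpu %d\n",
               DIV_ROUND_CLOSEST(max_load * busiest_power, SCHED_LOAD_SCALE), this_cpu);
        return 0;
}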
+ */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, + unsigned long *imbalance) +{ + unsigned long max_pull, load_above_capacity = ~0UL; + + sds->busiest_load_per_task /= sds->busiest_nr_running; + if (sds->group_imb) { + sds->busiest_load_per_task = + min(sds->busiest_load_per_task, sds->avg_load); + } + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (sds->max_load < sds->avg_load) { + *imbalance = 0; + return fix_small_imbalance(sds, this_cpu, imbalance); + } + + if (!sds->group_imb) { + /* + * Don't want to pull so many tasks that a group would go idle. + */ + load_above_capacity = (sds->busiest_nr_running - + sds->busiest_group_capacity); + + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + + load_above_capacity /= sds->busiest->cpu_power; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load. At the same time, + * we also don't want to reduce the group load below the group capacity + * (so that we can implement power-savings policies etc). Thus we look + * for the minimum possible imbalance. + * Be careful of negative numbers as they'll appear as very large values + * with unsigned longs. + */ + max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * sds->busiest->cpu_power, + (sds->avg_load - sds->this_load) * sds->this->cpu_power) + / SCHED_LOAD_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(sds, this_cpu, imbalance); + +} +/******* find_busiest_group() helpers end here *********************/ + +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + * be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. 
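The interplay of max_pull and load_above_capacity in calculate_imbalance() above is easiest to follow with concrete numbers. A self-contained sketch assuming two single-CPU groups of full power, three tasks on the busiest one and one task locally; the values are hypothetical:

/* Illustrative only: the imbalance formula from calculate_imbalance(). */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long power = 1024;                      /* both groups: one full-power cpu */
        unsigned long max_load = 3072, this_load = 1024; /* scaled per-group loads */
        unsigned long avg_load = (max_load + this_load) / 2;
        unsigned long nr_running = 3, capacity = 1;

        unsigned long load_above_capacity =
                (nr_running - capacity) * SCHED_LOAD_SCALE * SCHED_LOAD_SCALE / power;
        unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);
        unsigned long imbalance = min_ul(max_pull * power,
                                         (avg_load - this_load) * power) / SCHED_LOAD_SCALE;

        printf("imbalance = %lu\n", imbalance);          /* 1024 */
        return 0;
}

The result of 1024 is one default-weight task's worth of load, so the subsequent move_tasks() pass would aim to pull exactly one task.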
+ */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, const struct cpumask *cpus, int *balance) +{ + struct sd_lb_stats sds; + + memset(&sds, 0, sizeof(sds)); + + /* + * Compute the various statistics relavent for load balancing at + * this level. + */ + update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, + balance, &sds); + + /* Cases where imbalance does not exist from POV of this_cpu */ + /* 1) this_cpu is not the appropriate cpu to perform load balancing + * at this level. + * 2) There is no busy sibling group to pull from. + * 3) This group is the busiest group. + * 4) This group is more busy than the avg busieness at this + * sched_domain. + * 5) The imbalance is within the specified limit. + */ + if (!(*balance)) + goto ret; + + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + + if (!sds.busiest || sds.busiest_nr_running == 0) + goto out_balanced; + + if (sds.this_load >= sds.max_load) + goto out_balanced; + + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + + if (sds.this_load >= sds.avg_load) + goto out_balanced; + + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; + + /* Looks like there is an imbalance. Compute it */ + calculate_imbalance(&sds, this_cpu, imbalance); + return sds.busiest; + +out_balanced: + /* + * There is no obvious imbalance. But check if we can do some balancing + * to save power. + */ + if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) + return sds.busiest; +ret: + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static struct rq * +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) +{ + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + unsigned long wl; + + if (!capacity) + capacity = fix_small_capacity(sd, group); + + if (!cpumask_test_cpu(i, cpus)) + continue; + + rq = cpu_rq(i); + wl = weighted_cpuload(i); + + /* + * When comparing with imbalance, use weighted_cpuload() + * which is not scaled with the cpu power. + */ + if (capacity && rq->nr_running == 1 && wl > imbalance) + continue; + + /* + * For the load comparisons with the other cpu's, consider + * the weighted_cpuload() scaled with the cpu power, so that + * the load can be moved away from the cpu that is potentially + * running at a lower capacity. + */ + wl = (wl * SCHED_LOAD_SCALE) / power; + + if (wl > max_load) { + max_load = wl; + busiest = rq; + } + } + + return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) +{ + if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. 
+ */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + return 0; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return 0; + } + + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); +} + +static int active_load_balance_cpu_stop(void *data); + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + */ +static int load_balance(int this_cpu, struct rq *this_rq, + struct sched_domain *sd, enum cpu_idle_type idle, + int *balance) +{ + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; + unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + + cpumask_copy(cpus, cpu_active_mask); + + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as CPU_IDLE, instead of + * portraying it as CPU_NOT_IDLE. + */ + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + sd_idle = 1; + + schedstat_inc(sd, lb_count[idle]); + +redo: + update_shares(sd); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + cpus, balance); + + if (*balance == 0) + goto out_balanced; + + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. ld_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + local_irq_save(flags); + double_rq_lock(this_rq, busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); + + /* + * some other cpu did the load balance for us. 
+ */ + if (ld_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) + goto redo; + goto out_balanced; + } + } + + if (!ld_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { + raw_spin_lock_irqsave(&busiest->lock, flags); + + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { + raw_spin_unlock_irqrestore(&busiest->lock, + flags); + all_pinned = 1; + goto out_one_pinned; + } + + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. + */ + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + raw_spin_unlock_irqrestore(&busiest->lock, flags); + + if (active_balance) + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries+1; + } + } else + sd->nr_balance_failed = 0; + + if (likely(!active_balance)) { + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + } + + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + ld_moved = -1; + + goto out; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + sd->nr_balance_failed = 0; + +out_one_pinned: + /* tune up the balancing interval */ + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) + sd->balance_interval *= 2; + + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) + ld_moved = -1; + else + ld_moved = 0; +out: + if (ld_moved) + update_shares(sd); + return ld_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static void idle_balance(int this_cpu, struct rq *this_rq) +{ + struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + HZ; + + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + + /* + * Drop the rq->lock, but keep IRQ/preempt disabled. 
+ */ + raw_spin_unlock(&this_rq->lock); + + for_each_domain(this_cpu, sd) { + unsigned long interval; + int balance = 1; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (sd->flags & SD_BALANCE_NEWIDLE) { + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, &balance); + } + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + if (pulled_task) { + this_rq->idle_stamp = 0; + break; + } + } + + raw_spin_lock(&this_rq->lock); + + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; + } +} + +/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; + + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) { + if ((sd->flags & SD_LOAD_BALANCE) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + schedstat_inc(sd, alb_count); + + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +#ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. 
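The nomination scheme described above comes down to an atomic compare-and-swap on nohz.load_balancer: an idle CPU takes the role only if nobody holds it, and hands it back when it stops being idle. A minimal user-space model of that handshake, not kernel code, with NR_CPUS standing in for nr_cpu_ids ("no owner"):

/* Illustrative only: claiming and releasing the idle-load-balancer role. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4                               /* stands in for nr_cpu_ids, i.e. "no owner" */

static atomic_int load_balancer = NR_CPUS;

static void enter_idle(int cpu)
{
        int none = NR_CPUS;

        /* Volunteer as ilb owner only if the role is currently free. */
        atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

static void exit_idle(int cpu)
{
        int me = cpu;

        /* Give the role back so another idle CPU can pick it up. */
        atomic_compare_exchange_strong(&load_balancer, &me, NR_CPUS);
}

int main(void)
{
        enter_idle(2);
        printf("ilb owner: %d\n", atomic_load(&load_balancer));        /* 2 */
        enter_idle(3);                                                 /* too late, 2 keeps it */
        printf("ilb owner: %d\n", atomic_load(&load_balancer));        /* still 2 */
        exit_idle(2);
        printf("ilb owner: %d\n", atomic_load(&load_balancer));        /* 4, i.e. nobody */
        return 0;
}

The kernel version layers the power-aware find_new_ilb() re-selection and the softirq kick on top of this basic ownership rule.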
+ */ +static struct { + atomic_t load_balancer; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.grp_idle_mask)) + return 0; + + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. 
Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.idle_cpus_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.grp_idle_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return nr_cpu_ids; +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return nr_cpu_ids; +} +#endif + +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. + * + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. + * + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). + */ +void select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return; + + /* + * If we are going offline and still the leader, + * give up! + */ + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) + BUG(); + + return; + } + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { + int new_ilb; + + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, nr_cpu_ids); + resched_cpu(new_ilb); + return; + } + return; + } + } else { + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; + + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) + BUG(); + } + return; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. 
+ */ +static void rebalance_domains(int cpu, enum cpu_idle_type idle) +{ + int balance = 1; + struct rq *rq = cpu_rq(cpu); + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize; + + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != CPU_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + if (interval > HZ*NR_CPUS/10) + interval = HZ*NR_CPUS/10; + + need_serialize = sd->flags & SD_SERIALIZE; + + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. + */ + idle = CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; + } + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; +} + +#ifdef CONFIG_NO_HZ +/* + * In CONFIG_NO_HZ case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. 
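The first_pick/second_pick heuristic described above can be modelled as two compare-and-swap slots: the CPU holding the first slot kicks only once it runs more than one task, while a second busy CPU kicks as soon as it has any load. A simplified user-space sketch that leaves out the slot clearing done in select_nohz_load_balancer() and the next_balance time check:

/* Illustrative only: who gets to kick the idle load balancer. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4                               /* stands in for nr_cpu_ids, i.e. "unclaimed" */

static atomic_int first_pick = NR_CPUS, second_pick = NR_CPUS;

static int kick_needed(int cpu, int nr_running)
{
        int expected = NR_CPUS;

        /* The first busy CPU claims first_pick; it kicks only with more than one task. */
        if (atomic_compare_exchange_strong(&first_pick, &expected, cpu) || expected == cpu)
                return nr_running > 1;

        /* A different busy CPU claims second_pick; any load at all triggers a kick. */
        expected = NR_CPUS;
        if (atomic_compare_exchange_strong(&second_pick, &expected, cpu) || expected == cpu)
                return nr_running > 0;

        return 0;
}

int main(void)
{
        printf("%d\n", kick_needed(0, 1));      /* cpu0 claims first_pick, one task:   no kick */
        printf("%d\n", kick_needed(0, 2));      /* cpu0 still first_pick, two tasks:   kick */
        printf("%d\n", kick_needed(1, 1));      /* cpu1 becomes second_pick, any load: kick */
        return 0;
}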
+ */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (rq->idle_at_tick) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? + CPU_IDLE : CPU_NOT_IDLE; + + rebalance_domains(this_cpu, idle); + + /* + * If this cpu has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + nohz_idle_balance(this_cpu, idle); +} + +static inline int on_null_domain(int cpu) +{ + return !rcu_dereference_sched(cpu_rq(cpu)->sd); +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + */ +static inline void trigger_load_balance(struct rq *rq, int cpu) +{ + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) + raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif +} + static void rq_online_fair(struct rq *rq) { update_sysctl(); @@ -1962,6 +3709,15 @@ static void rq_offline_fair(struct rq *rq) update_sysctl(); } +#else /* CONFIG_SMP */ + +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + #endif /* CONFIG_SMP */ /* @@ -1993,6 +3749,8 @@ static void task_fork_fair(struct task_struct *p) raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + if (unlikely(task_cpu(p) != this_cpu)) __set_task_cpu(p, this_cpu); @@ -2076,7 +3834,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq) } #endif -unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; unsigned int rr_interval = 0; @@ -2108,8 +3866,6 @@ static const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, - .load_balance = load_balance_fair, - .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d5059fd761d..83c66e8ad3e 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,11 +1,4 @@ /* - * Disregards a certain amount of sleep time (sched_latency_ns) and - * considers the task to be running during that period. 
This gives it - * a service deficit on wakeup, allowing it to run sooner. - */ -SCHED_FEAT(FAIR_SLEEPERS, 1) - -/* * Only give sleepers 50% of their service deficit. This allows * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. @@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1) SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* - * By not normalizing the sleep time, heavy tasks get an effective - * longer period, and lighter task an effective shorter period they - * are considered running. - */ -SCHED_FEAT(NORMALIZED_SLEEPER, 0) - -/* * Place new tasks ahead so that they do not starve already running * tasks */ @@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) /* - * Compute wakeup_gran based on task behaviour, clipped to - * [0, sched_wakeup_gran_ns] - */ -SCHED_FEAT(ADAPTIVE_GRAN, 1) - -/* - * When converting the wakeup granularity to virtual time, do it such - * that heavier tasks preempting a lighter task have an edge. - */ -SCHED_FEAT(ASYM_GRAN, 1) - -/* - * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. - */ -SCHED_FEAT(WAKEUP_SYNC, 0) - -/* - * Wakeup preempt based on task behaviour. Tasks that do not overlap - * don't get preempted. - */ -SCHED_FEAT(WAKEUP_OVERLAP, 0) - -/* - * Use the SYNC wakeup hint, pipes and the likes use this to indicate - * the remote end is likely to consume the data we just wrote, and - * therefore has cache benefit from being placed on the same cpu, see - * also AFFINE_WAKEUPS. - */ -SCHED_FEAT(SYNC_WAKEUPS, 1) - -/* * Based on load and program behaviour, see if it makes sense to place * a newly woken task on the same cpu as the task that woke it -- * improve cache locality. Typically used with SYNC wakeups as @@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) /* - * Weaken SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_LESS, 1) - -/* - * Add SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_MORE, 0) - -/* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5f93b570d38..9fa0f402c87 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,8 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +static int +select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - /* adjust the active tasks as we might go into a long sleep */ - calc_load_account_active(rq); + calc_load_account_idle(rq); return rq->idle; } @@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) * message if some code attempts to do it: */ static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); @@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } -#ifdef CONFIG_SMP -static unsigned long -load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - return 0; -} - -static int -move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - return 0; -} -#endif - static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } @@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } -unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) { return 0; } @@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - - .load_balance = load_balance_idle, - .move_one_task = move_one_task_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f48328ac216..d10c80ebb67 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + int this_cpu = smp_processor_id(); struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - struct sched_rt_entity *rt_se = rt_rq->rt_se; + struct sched_rt_entity *rt_se; + + rt_se = rt_rq->tg->rt_se[this_cpu]; if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - struct sched_rt_entity *rt_se = rt_rq->rt_se; + int this_cpu = smp_processor_id(); + struct sched_rt_entity *rt_se; + + rt_se = rt_rq->tg->rt_se[this_cpu]; if (rt_se && on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); @@ -607,7 +613,7 @@ 
static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); @@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - list_add_tail(&rt_se->run_list, queue); + if (head) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se); + __enqueue_rt_entity(rt_se, head); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) @@ -871,27 +880,28 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se); + __enqueue_rt_entity(rt_se, false); } } /* * Adding/removing a task to/from a priority array: */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; - if (wakeup) + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; @@ -938,10 +948,9 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +static int +select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) { - struct rq *rq = task_rq(p); - if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); @@ -1136,7 +1145,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) if (next && next->prio < idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p = rt_task_of(rt_se); + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; break; @@ -1481,24 +1495,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static unsigned long -load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) -{ - /* don't touch RT tasks */ - return 0; 
-} - -static int -move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) -{ - /* don't touch RT tasks */ - return 0; -} - static void set_cpus_allowed_rt(struct task_struct *p, const struct cpumask *new_mask) { @@ -1667,11 +1663,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; - if (!p->signal) - return; - - soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; - hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; + /* max may change after cur was read, this will be fixed next tick */ + soft = task_rlimit(p, RLIMIT_RTTIME); + hard = task_rlimit_max(p, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { unsigned long next; @@ -1721,7 +1715,7 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } -unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) +static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* * Time slice is 0 for SCHED_FIFO tasks @@ -1746,8 +1740,6 @@ static const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .load_balance = load_balance_rt, - .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b..25c2f962f6f 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer; - struct signal_struct *sig; - - sig = tsk->signal; - /* see __exit_signal()->task_rq_unlock_wait() */ - barrier(); - if (unlikely(!sig)) - return; - - cputimer = &sig->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b..919562c3d6b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -159,6 +159,10 @@ void recalc_sigpending(void) /* Given the mask, find the first available signal that should be serviced. 
*/ +#define SYNCHRONOUS_MASK \ + (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ + sigmask(SIGTRAP) | sigmask(SIGFPE)) + int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; @@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) s = pending->signal.sig; m = mask->sig; + + /* + * Handle the first word specially: it contains the + * synchronous signals that need to be dequeued first. + */ + x = *s &~ *m; + if (x) { + if (x & SYNCHRONOUS_MASK) + x &= SYNCHRONOUS_MASK; + sig = ffz(~x) + 1; + return sig; + } + switch (_NSIG_WORDS) { default: - for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) - if ((x = *s &~ *m) != 0) { - sig = ffz(~x) + i*_NSIG_BPW + 1; - break; - } + for (i = 1; i < _NSIG_WORDS; ++i) { + x = *++s &~ *++m; + if (!x) + continue; + sig = ffz(~x) + i*_NSIG_BPW + 1; + break; + } break; - case 2: if ((x = s[0] &~ m[0]) != 0) - sig = 1; - else if ((x = s[1] &~ m[1]) != 0) - sig = _NSIG_BPW + 1; - else + case 2: + x = s[1] &~ m[1]; + if (!x) break; - sig += ffz(~x); + sig = ffz(~x) + _NSIG_BPW + 1; break; - case 1: if ((x = *s &~ *m) != 0) - sig = ffz(~x) + 1; + case 1: + /* Nothing to do */ break; } @@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + task_rlimit(t, RLIMIT_SIGPENDING)) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); @@ -620,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred = current_cred(), *tcred; + const struct cred *cred, *tcred; struct pid *sid; int error; @@ -639,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; + cred = current_cred(); tcred = __task_cred(t); - if ((cred->euid ^ tcred->suid) && + if (!same_thread_group(current, t) && + (cred->euid ^ tcred->suid) && (cred->euid ^ tcred->uid) && (cred->uid ^ tcred->suid) && (cred->uid ^ tcred->uid) && @@ -1066,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) /* * Nuke all other threads in the group. 
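The SYNCHRONOUS_MASK handling added to next_signal() above lets fault signals overtake anything else queued in the first word of the pending set. A user-space model; the signal numbers are the usual x86 values (assumed here) and __builtin_ctzl stands in for ffz(~x):

/* Illustrative only: synchronous (fault) signals are dequeued before others. */
#include <stdio.h>

#define sigmask(sig)    (1UL << ((sig) - 1))
#define SIGILL          4
#define SIGTRAP         5
#define SIGBUS          7
#define SIGFPE          8
#define SIGSEGV         11
#define SIGTERM         15

#define SYNCHRONOUS_MASK \
        (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
         sigmask(SIGTRAP) | sigmask(SIGFPE))

static int first_signal(unsigned long pending, unsigned long blocked)
{
        unsigned long x = pending & ~blocked;

        if (!x)
                return 0;
        if (x & SYNCHRONOUS_MASK)               /* restrict the search to fault signals */
                x &= SYNCHRONOUS_MASK;
        return __builtin_ctzl(x) + 1;           /* lowest pending signal number */
}

int main(void)
{
        /* SIGTERM was raised first, but the pending SIGSEGV is delivered first. */
        printf("next signal: %d\n", first_signal(sigmask(SIGTERM) | sigmask(SIGSEGV), 0));  /* 11 */
        return 0;
}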
*/ -void zap_other_threads(struct task_struct *p) +int zap_other_threads(struct task_struct *p) { - struct task_struct *t; + struct task_struct *t = p; + int count = 0; p->signal->group_stop_count = 0; - for (t = next_thread(p); t != p; t = next_thread(t)) { - /* - * Don't bother with already dead threads - */ + while_each_thread(p, t) { + count++; + + /* Don't bother with already dead threads */ if (t->exit_state) continue; - - /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } + + return count; } struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) @@ -1107,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); @@ -2192,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); #endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitely for the right codes here. + */ + if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); @@ -2718,3 +2749,43 @@ void __init signals_init(void) { sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } + +#ifdef CONFIG_KGDB_KDB +#include <linux/kdb.h> +/* + * kdb_send_sig_info - Allows kdb to send signals without exposing + * signal internals. This function checks if the required locks are + * available before calling the main signal code, to avoid kdb + * deadlocks. + */ +void +kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +{ + static struct task_struct *kdb_prev_t; + int sig, new_t; + if (!spin_trylock(&t->sighand->siglock)) { + kdb_printf("Can't do kill command now.\n" + "The sigmask lock is held somewhere else in " + "kernel, try again later\n"); + return; + } + spin_unlock(&t->sighand->siglock); + new_t = kdb_prev_t != t; + kdb_prev_t = t; + if (t->state != TASK_RUNNING && new_t) { + kdb_printf("Process is not RUNNING, sending a signal from " + "kdb risks deadlock\n" + "on the run queue locks. " + "The signal has _not_ been sent.\n" + "Reissue the kill command if you want to risk " + "the deadlock.\n"); + return; + } + sig = info->si_signo; + if (send_sig_info(sig, info, t)) + kdb_printf("Fail to deliver Signal %d to process %d.\n", + sig, t->pid); + else + kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); +} +#endif /* CONFIG_KGDB_KDB */ diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c4364529..00000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/seq_file.h> -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for debugfs - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? 
"vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7494bbf5a27..00000000000 --- a/kernel/slow-work.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- * - * See Documentation/slow-work.txt - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/wait.h> -#include <linux/debugfs.h> -#include "slow-work.h" - -static void slow_work_cull_timeout(unsigned long); -static void slow_work_oom_timeout(unsigned long); - -#ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -static int slow_work_max_threads_sysctl(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -#endif - -/* - * The pool of threads has at least min threads in it as long as someone is - * using the facility, and may have as many as max. - * - * A portion of the pool may be processing very slow operations. - */ -static unsigned slow_work_min_threads = 2; -static unsigned slow_work_max_threads = 4; -static unsigned vslow_work_proportion = 50; /* % of threads that may process - * very slow work */ - -#ifdef CONFIG_SYSCTL -static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; -static const int slow_work_min_vslow = 1; -static const int slow_work_max_vslow = 99; - -ctl_table slow_work_sysctls[] = { - { - .procname = "min-threads", - .data = &slow_work_min_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_min_threads_sysctl, - .extra1 = (void *) &slow_work_min_min_threads, - .extra2 = &slow_work_max_threads, - }, - { - .procname = "max-threads", - .data = &slow_work_max_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_max_threads_sysctl, - .extra1 = &slow_work_min_threads, - .extra2 = (void *) &slow_work_max_max_threads, - }, - { - .procname = "vslow-percentage", - .data = &vslow_work_proportion, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &slow_work_min_vslow, - .extra2 = (void *) &slow_work_max_vslow, - }, - {} -}; -#endif - -/* - * The active state of the thread pool - */ -static atomic_t slow_work_thread_count; -static atomic_t vslow_work_executing_count; - -static bool slow_work_may_not_start_new_thread; -static bool slow_work_cull; /* cull a thread due to lack of activity */ -static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); -static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); -static struct slow_work slow_work_new_thread; /* new thread starter */ - -/* - * slow work ID allocation (use slow_work_queue_lock) - */ -static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - -/* - * Unregistration tracking to prevent put_ref() from disappearing during module - * unload - */ -#ifdef CONFIG_MODULES -static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; -static struct module *slow_work_unreg_module; -static struct slow_work *slow_work_unreg_work_item; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); -static DEFINE_MUTEX(slow_work_unreg_sync_lock); - -static void slow_work_set_thread_processing(int id, struct slow_work *work) -{ - if (work) - slow_work_thread_processing[id] = work->owner; -} -static void slow_work_done_thread_processing(int id, struct slow_work *work) -{ - struct module *module = slow_work_thread_processing[id]; - - slow_work_thread_processing[id] = NULL; - smp_mb(); - if (slow_work_unreg_work_item == work || - slow_work_unreg_module == module) - wake_up_all(&slow_work_unreg_wq); -} -static void slow_work_clear_thread_processing(int id) -{ - 
slow_work_thread_processing[id] = NULL; -} -#else -static void slow_work_set_thread_processing(int id, struct slow_work *work) {} -static void slow_work_done_thread_processing(int id, struct slow_work *work) {} -static void slow_work_clear_thread_processing(int id) {} -#endif - -/* - * Data for tracking currently executing items for indication through /proc - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; -pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; -DEFINE_RWLOCK(slow_work_execs_lock); -#endif - -/* - * The queues of work items and the lock governing access to them. These are - * shared between all the CPUs. It doesn't make sense to have per-CPU queues - * as the number of threads bears no relation to the number of CPUs. - * - * There are two queues of work items: one for slow work items, and one for - * very slow work items. - */ -LIST_HEAD(slow_work_queue); -LIST_HEAD(vslow_work_queue); -DEFINE_SPINLOCK(slow_work_queue_lock); - -/* - * The following are two wait queues that get pinged when a work item is placed - * on an empty queue. These allow work items that are hogging a thread by - * sleeping in a way that could be deferred to yield their thread and enqueue - * themselves. - */ -static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); -static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); - -/* - * The thread controls. A variable used to signal to the threads that they - * should exit when the queue is empty, a waitqueue used by the threads to wait - * for signals, and a completion set by the last thread to exit. - */ -static bool slow_work_threads_should_exit; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); -static DECLARE_COMPLETION(slow_work_last_thread_exited); - -/* - * The number of users of the thread pool and its lock. Whilst this is zero we - * have no threads hanging around, and when this reaches zero, we wait for all - * active or queued work items to complete and kill all the threads we do have. - */ -static int slow_work_user_count; -static DEFINE_MUTEX(slow_work_user_lock); - -static inline int slow_work_get_ref(struct slow_work *work) -{ - if (work->ops->get_ref) - return work->ops->get_ref(work); - - return 0; -} - -static inline void slow_work_put_ref(struct slow_work *work) -{ - if (work->ops->put_ref) - work->ops->put_ref(work); -} - -/* - * Calculate the maximum number of active threads in the pool that are - * permitted to process very slow work items. - * - * The answer is rounded up to at least 1, but may not equal or exceed the - * maximum number of the threads in the pool. This means we always have at - * least one thread that can process slow work items, and we always have at - * least one thread that won't get tied up doing so. - */ -static unsigned slow_work_calc_vsmax(void) -{ - unsigned vsmax; - - vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; - vsmax /= 100; - vsmax = max(vsmax, 1U); - return min(vsmax, slow_work_max_threads - 1); -} - -/* - * Attempt to execute stuff queued on a slow thread. Return true if we managed - * it, false if there was nothing to do. 
- */ -static noinline bool slow_work_execute(int id) -{ - struct slow_work *work = NULL; - unsigned vsmax; - bool very_slow; - - vsmax = slow_work_calc_vsmax(); - - /* see if we can schedule a new thread to be started if we're not - * keeping up with the work */ - if (!waitqueue_active(&slow_work_thread_wq) && - (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && - atomic_read(&slow_work_thread_count) < slow_work_max_threads && - !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - - /* find something to execute */ - spin_lock_irq(&slow_work_queue_lock); - if (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax) { - work = list_entry(vslow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - atomic_inc(&vslow_work_executing_count); - very_slow = true; - } else if (!list_empty(&slow_work_queue)) { - work = list_entry(slow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - very_slow = false; - } else { - very_slow = false; /* avoid the compiler warning */ - } - - slow_work_set_thread_processing(id, work); - if (work) { - slow_work_mark_time(work); - slow_work_begin_exec(id, work); - } - - spin_unlock_irq(&slow_work_queue_lock); - - if (!work) - return false; - - if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) - BUG(); - - /* don't execute if the work is in the process of being cancelled */ - if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) - work->ops->execute(work); - - if (very_slow) - atomic_dec(&vslow_work_executing_count); - clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); - - /* wake up anyone waiting for this work to be complete */ - wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); - - slow_work_end_exec(id, work); - - /* if someone tried to enqueue the item whilst we were executing it, - * then it'll be left unenqueued to avoid multiple threads trying to - * execute it simultaneously - * - * there is, however, a race between us testing the pending flag and - * getting the spinlock, and between the enqueuer setting the pending - * flag and getting the spinlock, so we use a deferral bit to tell us - * if the enqueuer got there first - */ - if (test_bit(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irq(&slow_work_queue_lock); - - if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && - test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) - goto auto_requeue; - - spin_unlock_irq(&slow_work_queue_lock); - } - - /* sort out the race between module unloading and put_ref() */ - slow_work_put_ref(work); - slow_work_done_thread_processing(id, work); - - return true; - -auto_requeue: - /* we must complete the enqueue operation - * - we transfer our ref on the item back to the appropriate queue - * - don't wake another thread up as we're awake already - */ - slow_work_mark_time(work); - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); - spin_unlock_irq(&slow_work_queue_lock); - slow_work_clear_thread_processing(id); - return true; -} - -/** - * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work - * work: The work item under execution that wants to sleep - * _timeout: Scheduler sleep timeout - * - * Allow a requeueable work item to sleep on a slow-work processor thread until - * 
that thread is needed to do some other work or the sleep is interrupted by - * some other event. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * False is returned if there is nothing on the queue; true is returned if the - * work item should be requeued - */ -bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - - DEFINE_WAIT(wait); - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - if (!list_empty(queue)) - return true; - - add_wait_queue_exclusive(wfo_wq, &wait); - if (list_empty(queue)) - *_timeout = schedule_timeout(*_timeout); - finish_wait(wfo_wq, &wait); - - return !list_empty(queue); -} -EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); - -/** - * slow_work_enqueue - Schedule a slow work item for processing - * @work: The work item to queue - * - * Schedule a slow work item for processing. If the item is already undergoing - * execution, this guarantees not to re-enter the execution routine until the - * first execution finishes. - * - * The item is pinned by this function as it retains a reference to it, managed - * through the item operations. The item is unpinned once it has been - * executed. - * - * An item may hog the thread that is running it for a relatively large amount - * of time, sufficient, for example, to perform several lookup, mkdir, create - * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. - * - * Conversely, if a number of items are awaiting processing, it may take some - * time before any given item is given attention. The number of threads in the - * pool may be increased to deal with demand, but only up to a limit. - * - * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in - * the very slow queue, from which only a portion of the threads will be - * allowed to pick items to execute. This ensures that very slow items won't - * overly block ones that are just ordinarily slow. 
- * - * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is - * attempted queued) - */ -int slow_work_enqueue(struct slow_work *work) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - unsigned long flags; - int ret; - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - /* when honouring an enqueue request, we only promise that we will run - * the work function in the future; we do not promise to run it once - * per enqueue request - * - * we use the PENDING bit to merge together repeat requests without - * having to disable IRQs and take the spinlock, whilst still - * maintaining our promise - */ - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) - goto cancelled; - - /* we promise that we will not attempt to execute the work - * function in more than one thread simultaneously - * - * this, however, leaves us with a problem if we're asked to - * enqueue the work whilst someone is executing the work - * function as simply queueing the work immediately means that - * another thread may try executing it whilst it is already - * under execution - * - * to deal with this, we set the ENQ_DEFERRED bit instead of - * enqueueing, and the thread currently executing the work - * function will enqueue the work item when the work function - * returns and it has cleared the EXECUTING bit - */ - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - } else { - ret = slow_work_get_ref(work); - if (ret < 0) - goto failed; - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - wake_up(&slow_work_thread_wq); - - /* if someone who could be requeued is sleeping on a - * thread, then ask them to yield their thread */ - if (work->link.prev == queue) - wake_up(wfo_wq); - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - return 0; - -cancelled: - ret = -ECANCELED; -failed: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(slow_work_enqueue); - -static int slow_work_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * slow_work_cancel - Cancel a slow work item - * @work: The work item to cancel - * - * This function will cancel a previously enqueued work item. If we cannot - * cancel the work item, it is guarenteed to have run when this function - * returns. 
- */ -void slow_work_cancel(struct slow_work *work) -{ - bool wait = true, put = false; - - set_bit(SLOW_WORK_CANCELLING, &work->flags); - smp_mb(); - - /* if the work item is a delayed work item with an active timer, we - * need to wait for the timer to finish _before_ getting the spinlock, - * lest we deadlock against the timer routine - * - * the timer routine will leave DELAYED set if it notices the - * CANCELLING flag in time - */ - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - del_timer_sync(&dwork->timer); - } - - spin_lock_irq(&slow_work_queue_lock); - - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - /* the timer routine aborted or never happened, so we are left - * holding the timer's reference on the item and should just - * drop the pending flag and wait for any ongoing execution to - * finish */ - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - - BUG_ON(timer_pending(&dwork->timer)); - BUG_ON(!list_empty(&work->link)); - - clear_bit(SLOW_WORK_DELAYED, &work->flags); - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { - /* the link in the pending queue holds a reference on the item - * that we will need to release */ - list_del_init(&work->link); - wait = false; - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { - /* the executor is holding our only reference on the item, so - * we merely need to wait for it to finish executing */ - clear_bit(SLOW_WORK_PENDING, &work->flags); - } - - spin_unlock_irq(&slow_work_queue_lock); - - /* the EXECUTING flag is set by the executor whilst the spinlock is set - * and before the item is dequeued - so assuming the above doesn't - * actually dequeue it, simply waiting for the EXECUTING flag to be - * released here should be sufficient */ - if (wait) - wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, - TASK_UNINTERRUPTIBLE); - - clear_bit(SLOW_WORK_CANCELLING, &work->flags); - if (put) - slow_work_put_ref(work); -} -EXPORT_SYMBOL(slow_work_cancel); - -/* - * Handle expiry of the delay timer, indicating that a delayed slow work item - * should now be queued if not cancelled - */ -static void delayed_slow_work_timer(unsigned long data) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - struct slow_work *work = (struct slow_work *) data; - unsigned long flags; - bool queued = false, put = false, first = false; - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { - clear_bit(SLOW_WORK_DELAYED, &work->flags); - - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - /* we discard the reference the timer was holding in - * favour of the one the executor holds */ - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - put = true; - } else { - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - queued = true; - if (work->link.prev == queue) - first = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - if (put) - slow_work_put_ref(work); - if (first) - wake_up(wfo_wq); - if (queued) - 
wake_up(&slow_work_thread_wq); -} - -/** - * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing - * @dwork: The delayed work item to queue - * @delay: When to start executing the work, in jiffies from now - * - * This is similar to slow_work_enqueue(), but it adds a delay before the work - * is actually queued for processing. - * - * The item can have delayed processing requested on it whilst it is being - * executed. The delay will begin immediately, and if it expires before the - * item finishes executing, the item will be placed back on the queue when it - * has done executing. - */ -int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay) -{ - struct slow_work *work = &dwork->work; - unsigned long flags; - int ret; - - if (delay == 0) - return slow_work_enqueue(&dwork->work); - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - goto cancelled; - - /* the timer holds a reference whilst it is pending */ - ret = work->ops->get_ref(work); - if (ret < 0) - goto cant_get_ref; - - if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) - BUG(); - dwork->timer.expires = jiffies + delay; - dwork->timer.data = (unsigned long) work; - dwork->timer.function = delayed_slow_work_timer; - add_timer(&dwork->timer); - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - - return 0; - -cancelled: - ret = -ECANCELED; -cant_get_ref: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(delayed_slow_work_enqueue); - -/* - * Schedule a cull of the thread pool at some time in the near future - */ -static void slow_work_schedule_cull(void) -{ - mod_timer(&slow_work_cull_timer, - round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); -} - -/* - * Worker thread culling algorithm - */ -static bool slow_work_cull_thread(void) -{ - unsigned long flags; - bool do_cull = false; - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (slow_work_cull) { - slow_work_cull = false; - - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) { - slow_work_schedule_cull(); - do_cull = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return do_cull; -} - -/* - * Determine if there is slow work available for dispatch - */ -static inline bool slow_work_available(int vsmax) -{ - return !list_empty(&slow_work_queue) || - (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax); -} - -/* - * Worker thread dispatcher - */ -static int slow_work_thread(void *_data) -{ - int vsmax, id; - - DEFINE_WAIT(wait); - - set_freezable(); - set_user_nice(current, -5); - - /* allocate ourselves an ID */ - spin_lock_irq(&slow_work_queue_lock); - id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); - __set_bit(id, slow_work_ids); - slow_work_set_thread_pid(id, current->pid); - spin_unlock_irq(&slow_work_queue_lock); - - sprintf(current->comm, "kslowd%03u", id); - - for (;;) { - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); - if (!freezing(current) && - 
!slow_work_threads_should_exit && - !slow_work_available(vsmax) && - !slow_work_cull) - schedule(); - finish_wait(&slow_work_thread_wq, &wait); - - try_to_freeze(); - - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - if (slow_work_available(vsmax) && slow_work_execute(id)) { - cond_resched(); - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) - slow_work_schedule_cull(); - continue; - } - - if (slow_work_threads_should_exit) - break; - - if (slow_work_cull && slow_work_cull_thread()) - break; - } - - spin_lock_irq(&slow_work_queue_lock); - slow_work_set_thread_pid(id, 0); - __clear_bit(id, slow_work_ids); - spin_unlock_irq(&slow_work_queue_lock); - - if (atomic_dec_and_test(&slow_work_thread_count)) - complete_and_exit(&slow_work_last_thread_exited, 0); - return 0; -} - -/* - * Handle thread cull timer expiration - */ -static void slow_work_cull_timeout(unsigned long data) -{ - slow_work_cull = true; - wake_up(&slow_work_thread_wq); -} - -/* - * Start a new slow work thread - */ -static void slow_work_new_thread_execute(struct slow_work *work) -{ - struct task_struct *p; - - if (slow_work_threads_should_exit) - return; - - if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) - return; - - if (!mutex_trylock(&slow_work_user_lock)) - return; - - slow_work_may_not_start_new_thread = true; - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) { - printk(KERN_DEBUG "Slow work thread pool: OOM\n"); - if (atomic_dec_and_test(&slow_work_thread_count)) - BUG(); /* we're running on a slow work thread... */ - mod_timer(&slow_work_oom_timer, - round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); - } else { - /* ratelimit the starting of new threads */ - mod_timer(&slow_work_oom_timer, jiffies + 1); - } - - mutex_unlock(&slow_work_user_lock); -} - -static const struct slow_work_ops slow_work_new_thread_ops = { - .owner = THIS_MODULE, - .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = slow_work_new_thread_desc, -#endif -}; - -/* - * post-OOM new thread start suppression expiration - */ -static void slow_work_oom_timeout(unsigned long data) -{ - slow_work_may_not_start_new_thread = false; -} - -#ifdef CONFIG_SYSCTL -/* - * Handle adjustment of the minimum number of threads - */ -static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to start or stop threads */ - n = atomic_read(&slow_work_thread_count) - - slow_work_min_threads; - - if (n < 0 && !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - else if (n > 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} - -/* - * Handle adjustment of the maximum number of threads - */ -static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to stop threads */ - n = slow_work_max_threads - - atomic_read(&slow_work_thread_count); - - if (n < 
0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/** - * slow_work_register_user - Register a user of the facility - * @module: The module about to make use of the facility - * - * Register a user of the facility, starting up the initial threads if there - * aren't any other users at this point. This will return 0 if successful, or - * an error if not. - */ -int slow_work_register_user(struct module *module) -{ - struct task_struct *p; - int loop; - - mutex_lock(&slow_work_user_lock); - - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); - init_completion(&slow_work_last_thread_exited); - - slow_work_threads_should_exit = false; - slow_work_init(&slow_work_new_thread, - &slow_work_new_thread_ops); - slow_work_may_not_start_new_thread = false; - slow_work_cull = false; - - /* start the minimum number of threads */ - for (loop = 0; loop < slow_work_min_threads; loop++) { - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) - goto error; - } - printk(KERN_NOTICE "Slow work thread pool: Ready\n"); - } - - slow_work_user_count++; - mutex_unlock(&slow_work_user_lock); - return 0; - -error: - if (atomic_dec_and_test(&slow_work_thread_count)) - complete(&slow_work_last_thread_exited); - if (loop > 0) { - printk(KERN_ERR "Slow work thread pool:" - " Aborting startup on ENOMEM\n"); - slow_work_threads_should_exit = true; - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_ERR "Slow work thread pool: Aborted\n"); - } - mutex_unlock(&slow_work_user_lock); - return PTR_ERR(p); -} -EXPORT_SYMBOL(slow_work_register_user); - -/* - * wait for all outstanding items from the calling module to complete - * - note that more items may be queued whilst we're waiting - */ -static void slow_work_wait_for_items(struct module *module) -{ -#ifdef CONFIG_MODULES - DECLARE_WAITQUEUE(myself, current); - struct slow_work *work; - int loop; - - mutex_lock(&slow_work_unreg_sync_lock); - add_wait_queue(&slow_work_unreg_wq, &myself); - - for (;;) { - spin_lock_irq(&slow_work_queue_lock); - - /* first of all, we wait for the last queued item in each list - * to be processed */ - list_for_each_entry_reverse(work, &vslow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - list_for_each_entry_reverse(work, &slow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - - /* then we wait for the items being processed to finish */ - slow_work_unreg_module = module; - smp_mb(); - for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { - if (slow_work_thread_processing[loop] == module) - goto do_wait; - } - spin_unlock_irq(&slow_work_queue_lock); - break; /* okay, we're done */ - - do_wait: - spin_unlock_irq(&slow_work_queue_lock); - schedule(); - slow_work_unreg_work_item = NULL; - slow_work_unreg_module = NULL; - } - - remove_wait_queue(&slow_work_unreg_wq, &myself); - mutex_unlock(&slow_work_unreg_sync_lock); -#endif /* CONFIG_MODULES */ -} - -/** - * slow_work_unregister_user - Unregister a user of the facility - * @module: The module whose items should be cleared - * - * Unregister a user of the facility, killing all the threads if this was the - * last one. 
- * - * This waits for all the work items belonging to the nominated module to go - * away before proceeding. - */ -void slow_work_unregister_user(struct module *module) -{ - /* first of all, wait for all outstanding items from the calling module - * to complete */ - if (module) - slow_work_wait_for_items(module); - - /* then we can actually go about shutting down the facility if need - * be */ - mutex_lock(&slow_work_user_lock); - - BUG_ON(slow_work_user_count <= 0); - - slow_work_user_count--; - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); - slow_work_threads_should_exit = true; - del_timer_sync(&slow_work_cull_timer); - del_timer_sync(&slow_work_oom_timer); - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_NOTICE "Slow work thread pool:" - " Shut down complete\n"); - } - - mutex_unlock(&slow_work_user_lock); -} -EXPORT_SYMBOL(slow_work_unregister_user); - -/* - * Initialise the slow work facility - */ -static int __init init_slow_work(void) -{ - unsigned nr_cpus = num_possible_cpus(); - - if (slow_work_max_threads < nr_cpus) - slow_work_max_threads = nr_cpus; -#ifdef CONFIG_SYSCTL - if (slow_work_max_max_threads < nr_cpus * 2) - slow_work_max_max_threads = nr_cpus * 2; -#endif -#ifdef CONFIG_SLOW_WORK_DEBUG - { - struct dentry *dbdir; - - dbdir = debugfs_create_dir("slow_work", NULL); - if (dbdir && !IS_ERR(dbdir)) - debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, - NULL, &slow_work_runqueue_fops); - } -#endif - return 0; -} - -subsys_initcall(init_slow_work); diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index 321f3c59d73..00000000000 --- a/kernel/slow-work.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Slow work private definitions - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- */ - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ - -/* - * slow-work.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern struct slow_work *slow_work_execs[]; -extern pid_t slow_work_pids[]; -extern rwlock_t slow_work_execs_lock; -#endif - -extern struct list_head slow_work_queue; -extern struct list_head vslow_work_queue; -extern spinlock_t slow_work_queue_lock; - -/* - * slow-work-debugfs.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern const struct file_operations slow_work_runqueue_fops; - -extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -#endif - -/* - * Helper functions - */ -static inline void slow_work_set_thread_pid(int id, pid_t pid) -{ -#ifdef CONFIG_SLOW_WORK_PROC - slow_work_pids[id] = pid; -#endif -} - -static inline void slow_work_mark_time(struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_PROC - work->mark = CURRENT_TIME; -#endif -} - -static inline void slow_work_begin_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_PROC - slow_work_execs[id] = work; -#endif -} - -static inline void slow_work_end_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_PROC - write_lock(&slow_work_execs_lock); - slow_work_execs[id] = NULL; - write_unlock(&slow_work_execs_lock); -#endif -} diff --git a/kernel/smp.c b/kernel/smp.c index f1040842244..ed6aacfcb7e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -9,11 +9,10 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> -static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); - static struct { struct list_head queue; raw_spinlock_t lock; @@ -33,12 +32,14 @@ struct call_function_data { cpumask_var_t cpumask; }; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); + struct call_single_queue { struct list_head list; raw_spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -51,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE_FROZEN: if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); break; #ifdef CONFIG_HOTPLUG_CPU @@ -256,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void) } } -static DEFINE_PER_CPU(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); /* * smp_call_function_single - Run a function on a specific CPU @@ -364,9 +365,10 @@ call: EXPORT_SYMBOL_GPL(smp_call_function_any); /** - * __smp_call_function_single(): Run a function on another CPU + * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. 
Useful for embedding @data inside @@ -375,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); void __smp_call_function_single(int cpu, struct call_single_data *data, int wait) { - csd_lock(data); + unsigned int this_cpu; + unsigned long flags; + this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can @@ -386,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() && !oops_in_progress); - generic_exec_single(cpu, data, wait); + if (cpu == this_cpu) { + local_irq_save(flags); + data->func(data->info); + local_irq_restore(flags); + } else { + csd_lock(data); + generic_exec_single(cpu, data, wait); + } + put_cpu(); } /** diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef7..07b4f1b1a73 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* @@ -721,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); - rcu_sched_qs((long)__bind_cpu); + rcu_note_context_switch((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); @@ -813,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); } kthread_bind(p, hotcpu); per_cpu(ksoftirqd, hotcpu) = p; @@ -855,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index d22579087e2..00000000000 --- a/kernel/softlockup.c +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Detect Soft Lockups - * - * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. - * - * this code detects soft lockups: incidents in where on a CPU - * the kernel does not reschedule for 10 seconds or more. 
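Several hotplug callbacks above stop returning a bare NOTIFY_BAD and instead return notifier_from_errno(...), so a real errno can be recovered by the caller. The standalone model below illustrates what the encode/decode pair in include/linux/notifier.h is assumed to do; the constants and the *_model helpers are assumptions for demonstration, not the kernel's definitions.

#include <assert.h>
#include <errno.h>

#define NOTIFY_DONE		0x0000
#define NOTIFY_OK		0x0001
#define NOTIFY_STOP_MASK	0x8000
#define NOTIFY_BAD		(NOTIFY_STOP_MASK | 0x0002)

/* model of notifier_from_errno(): fold a -E value into a "stop" reply */
static int notifier_from_errno_model(int err)
{
	return err ? (NOTIFY_STOP_MASK | (NOTIFY_OK - err)) : NOTIFY_OK;
}

/* model of notifier_to_errno(): recover the -E value on the caller's side */
static int notifier_to_errno_model(int ret)
{
	ret &= ~NOTIFY_STOP_MASK;
	return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

int main(void)
{
	/* the -ENOMEM from hotplug_cfd() survives the round trip */
	assert(notifier_to_errno_model(notifier_from_errno_model(-ENOMEM)) == -ENOMEM);
	/* with these constants, the legacy NOTIFY_BAD maps onto -EPERM */
	assert(notifier_from_errno_model(-EPERM) == NOTIFY_BAD);
	return 0;
}

The conversion changes nothing for callers that only check NOTIFY_BAD; its point is that a specific errno can now travel back to the notifier caller instead of a generic failure.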
- */ -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/lockdep.h> -#include <linux/notifier.h> -#include <linux/module.h> -#include <linux/sysctl.h> - -#include <asm/irq_regs.h> - -static DEFINE_SPINLOCK(print_lock); - -static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ -static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); - -static int __read_mostly did_panic; -int __read_mostly softlockup_thresh = 60; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * soft-lockup occurs: - */ -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int -softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = softlock_panic, -}; - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void __touch_softlockup_watchdog(void) -{ - int this_cpu = raw_smp_processor_id(); - - __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); -} - -void touch_softlockup_watchdog(void) -{ - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* Cause each CPU to re-update its timestamp rather than complain */ - for_each_online_cpu(cpu) - per_cpu(softlockup_touch_ts, cpu) = 0; -} -EXPORT_SYMBOL(touch_all_softlockup_watchdogs); - -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - -/* - * This callback runs from the timer interrupt, and checks - * whether the watchdog thread has hung or not: - */ -void softlockup_tick(void) -{ - int this_cpu = smp_processor_id(); - unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); - unsigned long print_ts; - struct pt_regs *regs = get_irq_regs(); - unsigned long now; - - /* Is detection switched off? */ - if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { - /* Be sure we don't false trigger if switched back on */ - if (touch_ts) - per_cpu(softlockup_touch_ts, this_cpu) = 0; - return; - } - - if (touch_ts == 0) { - __touch_softlockup_watchdog(); - return; - } - - print_ts = per_cpu(softlockup_print_ts, this_cpu); - - /* report at most once a second */ - if (print_ts == touch_ts || did_panic) - return; - - /* do not print during early bootup: */ - if (unlikely(system_state != SYSTEM_RUNNING)) { - __touch_softlockup_watchdog(); - return; - } - - now = get_timestamp(this_cpu); - - /* - * Wake up the high-prio watchdog task twice per - * threshold timespan. 
- */ - if (now > touch_ts + softlockup_thresh/2) - wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); - - /* Warn about unreasonable delays: */ - if (now <= (touch_ts + softlockup_thresh)) - return; - - per_cpu(softlockup_print_ts, this_cpu) = touch_ts; - - spin_lock(&print_lock); - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_ts, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - spin_unlock(&print_lock); - - if (softlockup_panic) - panic("softlockup: hung tasks"); -} - -/* - * The watchdog thread - runs every second and touches the timestamp. - */ -static int watchdog(void *__bind_cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_softlockup_watchdog(); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). - */ - while (!kthread_should_stop()) { - __touch_softlockup_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); - p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); - if (IS_ERR(p)) { - printk(KERN_ERR "watchdog for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - per_cpu(softlockup_touch_ts, hotcpu) = 0; - per_cpu(softlockup_watchdog, hotcpu) = p; - kthread_bind(p, hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(softlockup_watchdog, hotcpu)) - break; - /* Unbind so it can run. Fall thru. 
*/ - kthread_bind(per_cpu(softlockup_watchdog, hotcpu), - cpumask_any(cpu_online_mask)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - p = per_cpu(softlockup_watchdog, hotcpu); - per_cpu(softlockup_watchdog, hotcpu) = NULL; - kthread_stop(p); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static int __init spawn_softlockup_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nosoftlockup) - return 0; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - - return 0; -} -early_initcall(spawn_softlockup_task); diff --git a/kernel/srcu.c b/kernel/srcu.c index 818d7d9aa03..2980da3fd50 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -30,10 +30,33 @@ #include <linux/preempt.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/smp.h> #include <linux/srcu.h> +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->completed = 0; + mutex_init(&sp->mutex); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return sp->per_cpu_ref ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /** * init_srcu_struct - initialize a sleep-RCU structure * @sp: structure to initialize. @@ -44,13 +67,12 @@ */ int init_srcu_struct(struct srcu_struct *sp) { - sp->completed = 0; - mutex_init(&sp->mutex); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); - return (sp->per_cpu_ref ? 0 : -ENOMEM); + return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(init_srcu_struct); +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /* * srcu_readers_active_idx -- returns approximate number of readers * active on the specified rank of per-CPU counters. @@ -100,15 +122,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -/** - * srcu_read_lock - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. - * +/* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Must be called from process context. * Returns an index that must be passed to the matching srcu_read_unlock(). */ -int srcu_read_lock(struct srcu_struct *sp) +int __srcu_read_lock(struct srcu_struct *sp) { int idx; @@ -120,31 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp) preempt_enable(); return idx; } -EXPORT_SYMBOL_GPL(srcu_read_lock); +EXPORT_SYMBOL_GPL(__srcu_read_lock); -/** - * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). 
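In the srcu.c hunks above the entry points are renamed to __srcu_read_lock()/__srcu_read_unlock(), and __init_srcu_struct() gains a lockdep class, which suggests the public names become header wrappers that add lock-dependency annotations; callers keep using the same API. A minimal module-style sketch of that unchanged external API, with hypothetical names, might look like this:

#include <linux/module.h>
#include <linux/srcu.h>

static struct srcu_struct example_srcu;		/* hypothetical example state */

static int example_reader(void)
{
	int idx, ret;

	idx = srcu_read_lock(&example_srcu);	/* sleepable read side */
	ret = 0;				/* read SRCU-protected data here */
	srcu_read_unlock(&example_srcu, idx);
	return ret;
}

static void example_updater(void)
{
	/* unpublish the old data first, then wait for pre-existing readers */
	synchronize_srcu(&example_srcu);
	/* the old data may now be freed */
}

static int __init example_init(void)
{
	int ret = init_srcu_struct(&example_srcu);

	if (!ret)
		(void)example_reader();
	return ret;
}

static void __exit example_exit(void)
{
	example_updater();
	cleanup_srcu_struct(&example_srcu);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");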
- * +/* * Removes the count for the old reader from the appropriate per-CPU * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). * Must be called from process context. */ -void srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *sp, int idx) { preempt_disable(); srcu_barrier(); /* ensure compiler won't misorder critical section. */ per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; preempt_enable(); } -EXPORT_SYMBOL_GPL(srcu_read_unlock); +EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) +static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11..4372ccb2512 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,17 +1,384 @@ -/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. - * GPL v2 and any later version. +/* + * kernel/stop_machine.c + * + * Copyright (C) 2008, 2005 IBM Corporation. + * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2 and any later version. */ +#include <linux/completion.h> #include <linux/cpu.h> -#include <linux/err.h> +#include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> +#include <linux/percpu.h> #include <linux/sched.h> #include <linux/stop_machine.h> -#include <linux/syscalls.h> #include <linux/interrupt.h> +#include <linux/kallsyms.h> #include <asm/atomic.h> -#include <asm/uaccess.h> + +/* + * Structure to determine completion condition and record errors. May + * be shared by works on different cpus. + */ +struct cpu_stop_done { + atomic_t nr_todo; /* nr left to execute */ + bool executed; /* actually executed? */ + int ret; /* collected return value */ + struct completion completion; /* fired if nr_todo reaches 0 */ +}; + +/* the actual stopper, one per every possible cpu, enabled on online cpus */ +struct cpu_stopper { + spinlock_t lock; + bool enabled; /* is this stopper enabled? */ + struct list_head works; /* list of pending works */ + struct task_struct *thread; /* stopper thread */ +}; + +static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); + +static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) +{ + memset(done, 0, sizeof(*done)); + atomic_set(&done->nr_todo, nr_todo); + init_completion(&done->completion); +} + +/* signal completion unless @done is NULL */ +static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +{ + if (done) { + if (executed) + done->executed = true; + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); + } +} + +/* queue @work to @stopper. 
if offline, @work is completed immediately */ +static void cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + unsigned long flags; + + spin_lock_irqsave(&stopper->lock, flags); + + if (stopper->enabled) { + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); + } else + cpu_stop_signal_done(work->done, false); + + spin_unlock_irqrestore(&stopper->lock, flags); +} + +/** + * stop_one_cpu - stop a cpu + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on @cpu. @fn is run in a process context with + * the highest priority preempting any task on the cpu and + * monopolizing it. This function returns after the execution is + * complete. + * + * This function doesn't guarantee @cpu stays online till @fn + * completes. If @cpu goes down in the middle, execution may happen + * partially or fully on different cpus. @fn should either be ready + * for that or the caller should ensure that @cpu stays online until + * this function completes. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed because @cpu was offline; + * otherwise, the return value of @fn. + */ +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + + cpu_stop_init_done(&done, 1); + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_one_cpu_nowait - stop a cpu but don't wait for completion + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Similar to stop_one_cpu() but doesn't wait for completion. The + * caller is responsible for ensuring @work_buf is currently unused + * and will remain untouched until stopper starts executing @fn. + * + * CONTEXT: + * Don't care. + */ +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); +} + +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); + +int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_work *work; + struct cpu_stop_done done; + unsigned int cpu; + + /* initialize works and done */ + for_each_cpu(cpu, cpumask) { + work = &per_cpu(stop_cpus_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = &done; + } + cpu_stop_init_done(&done, cpumask_weight(cpumask)); + + /* + * Disable preemption while queueing to avoid getting + * preempted by a stopper which might wait for other stoppers + * to enter @fn which can lead to deadlock. + */ + preempt_disable(); + for_each_cpu(cpu, cpumask) + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), + &per_cpu(stop_cpus_work, cpu)); + preempt_enable(); + + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_cpus - stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, + * @fn is run in a process context with the highest priority + * preempting any task on the cpu and monopolizing it. This function + * returns after all executions are complete. 
+ * + * This function doesn't guarantee the cpus in @cpumask stay online + * till @fn completes. If some cpus go down in the middle, execution + * on the cpu may happen partially or fully on different cpus. @fn + * should either be ready for that or the caller should ensure that + * the cpus stay online until this function completes. + * + * All stop_cpus() calls are serialized making it safe for @fn to wait + * for all cpus to start executing it. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed at all because all cpus in + * @cpumask were offline; otherwise, 0 if all executions of @fn + * returned 0, any non zero return value if any returned non zero. + */ +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + mutex_lock(&stop_cpus_mutex); + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +/** + * try_stop_cpus - try to stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Identical to stop_cpus() except that it fails with -EAGAIN if + * someone else is already using the facility. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -EAGAIN if someone else is already stopping cpus, -ENOENT if + * @fn(@arg) was not executed at all because all cpus in @cpumask were + * offline; otherwise, 0 if all executions of @fn returned 0, any non + * zero return value if any returned non zero. + */ +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + if (!mutex_trylock(&stop_cpus_mutex)) + return -EAGAIN; + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +static int cpu_stopper_thread(void *data) +{ + struct cpu_stopper *stopper = data; + struct cpu_stop_work *work; + int ret; + +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + work = NULL; + spin_lock_irq(&stopper->lock); + if (!list_empty(&stopper->works)) { + work = list_first_entry(&stopper->works, + struct cpu_stop_work, list); + list_del_init(&work->list); + } + spin_unlock_irq(&stopper->lock); + + if (work) { + cpu_stop_fn_t fn = work->fn; + void *arg = work->arg; + struct cpu_stop_done *done = work->done; + char ksym_buf[KSYM_NAME_LEN]; + + __set_current_state(TASK_RUNNING); + + /* cpu stop callbacks are not allowed to sleep */ + preempt_disable(); + + ret = fn(arg); + if (ret) + done->ret = ret; + + /* restore preemption and check it's still balanced */ + preempt_enable(); + WARN_ONCE(preempt_count(), + "cpu_stop: %s(%p) leaked preempt count\n", + kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, + ksym_buf), arg); + + cpu_stop_signal_done(done, true); + } else + schedule(); + + goto repeat; +} + +/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ +static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + unsigned int cpu = (unsigned long)hcpu; + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + struct task_struct *p; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + BUG_ON(stopper->thread || stopper->enabled || + !list_empty(&stopper->works)); + p = 
kthread_create(cpu_stopper_thread, stopper, "migration/%d", + cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + sched_setscheduler_nocheck(p, SCHED_FIFO, &param); + get_task_struct(p); + stopper->thread = p; + break; + + case CPU_ONLINE: + kthread_bind(stopper->thread, cpu); + /* strictly unnecessary, as first user will wake it */ + wake_up_process(stopper->thread); + /* mark enabled */ + spin_lock_irq(&stopper->lock); + stopper->enabled = true; + spin_unlock_irq(&stopper->lock); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_POST_DEAD: + { + struct cpu_stop_work *work; + + /* kill the stopper */ + kthread_stop(stopper->thread); + /* drain remaining works */ + spin_lock_irq(&stopper->lock); + list_for_each_entry(work, &stopper->works, list) + cpu_stop_signal_done(work->done, false); + stopper->enabled = false; + spin_unlock_irq(&stopper->lock); + /* release the stopper */ + put_task_struct(stopper->thread); + stopper->thread = NULL; + break; + } +#endif + } + + return NOTIFY_OK; +} + +/* + * Give it a higher priority so that cpu stopper is available to other + * cpu notifiers. It currently shares the same priority as sched + * migration_notifier. + */ +static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { + .notifier_call = cpu_stop_cpu_callback, + .priority = 10, +}; + +static int __init cpu_stop_init(void) +{ + void *bcpu = (void *)(long)smp_processor_id(); + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + spin_lock_init(&stopper->lock); + INIT_LIST_HEAD(&stopper->works); + } + + /* start one for the boot cpu */ + err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, + bcpu); + BUG_ON(err == NOTIFY_BAD); + cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); + register_cpu_notifier(&cpu_stop_cpu_notifier); + + return 0; +} +early_initcall(cpu_stop_init); + +#ifdef CONFIG_STOP_MACHINE /* This controls the threads on each CPU. */ enum stopmachine_state { @@ -26,174 +393,94 @@ enum stopmachine_state { /* Exit */ STOPMACHINE_EXIT, }; -static enum stopmachine_state state; struct stop_machine_data { - int (*fn)(void *); - void *data; - int fnret; + int (*fn)(void *); + void *data; + /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ + unsigned int num_threads; + const struct cpumask *active_cpus; + + enum stopmachine_state state; + atomic_t thread_ack; }; -/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ -static unsigned int num_threads; -static atomic_t thread_ack; -static DEFINE_MUTEX(lock); -/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ -static DEFINE_MUTEX(setup_lock); -/* Users of stop_machine. */ -static int refcount; -static struct workqueue_struct *stop_machine_wq; -static struct stop_machine_data active, idle; -static const struct cpumask *active_cpus; -static void *stop_machine_work; - -static void set_state(enum stopmachine_state newstate) +static void set_state(struct stop_machine_data *smdata, + enum stopmachine_state newstate) { /* Reset ack counter. */ - atomic_set(&thread_ack, num_threads); + atomic_set(&smdata->thread_ack, smdata->num_threads); smp_wmb(); - state = newstate; + smdata->state = newstate; } /* Last one to ack a state moves to the next state. 
*/ -static void ack_state(void) +static void ack_state(struct stop_machine_data *smdata) { - if (atomic_dec_and_test(&thread_ack)) - set_state(state + 1); + if (atomic_dec_and_test(&smdata->thread_ack)) + set_state(smdata, smdata->state + 1); } -/* This is the actual function which stops the CPU. It runs - * in the context of a dedicated stopmachine workqueue. */ -static void stop_cpu(struct work_struct *unused) +/* This is the cpu_stop function which stops the CPU. */ +static int stop_machine_cpu_stop(void *data) { + struct stop_machine_data *smdata = data; enum stopmachine_state curstate = STOPMACHINE_NONE; - struct stop_machine_data *smdata = &idle; - int cpu = smp_processor_id(); - int err; + int cpu = smp_processor_id(), err = 0; + bool is_active; + + if (!smdata->active_cpus) + is_active = cpu == cpumask_first(cpu_online_mask); + else + is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - if (!active_cpus) { - if (cpu == cpumask_first(cpu_online_mask)) - smdata = &active; - } else { - if (cpumask_test_cpu(cpu, active_cpus)) - smdata = &active; - } /* Simple state machine */ do { /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - if (state != curstate) { - curstate = state; + if (smdata->state != curstate) { + curstate = smdata->state; switch (curstate) { case STOPMACHINE_DISABLE_IRQ: local_irq_disable(); hard_irq_disable(); break; case STOPMACHINE_RUN: - /* On multiple CPUs only a single error code - * is needed to tell that something failed. */ - err = smdata->fn(smdata->data); - if (err) - smdata->fnret = err; + if (is_active) + err = smdata->fn(smdata->data); break; default: break; } - ack_state(); + ack_state(smdata); } } while (curstate != STOPMACHINE_EXIT); local_irq_enable(); + return err; } -/* Callback for CPUs which aren't supposed to do anything. */ -static int chill(void *unused) -{ - return 0; -} - -int stop_machine_create(void) -{ - mutex_lock(&setup_lock); - if (refcount) - goto done; - stop_machine_wq = create_rt_workqueue("kstop"); - if (!stop_machine_wq) - goto err_out; - stop_machine_work = alloc_percpu(struct work_struct); - if (!stop_machine_work) - goto err_out; -done: - refcount++; - mutex_unlock(&setup_lock); - return 0; - -err_out: - if (stop_machine_wq) - destroy_workqueue(stop_machine_wq); - mutex_unlock(&setup_lock); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(stop_machine_create); - -void stop_machine_destroy(void) -{ - mutex_lock(&setup_lock); - refcount--; - if (refcount) - goto done; - destroy_workqueue(stop_machine_wq); - free_percpu(stop_machine_work); -done: - mutex_unlock(&setup_lock); -} -EXPORT_SYMBOL_GPL(stop_machine_destroy); - int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct work_struct *sm_work; - int i, ret; - - /* Set up initial state. */ - mutex_lock(&lock); - num_threads = num_online_cpus(); - active_cpus = cpus; - active.fn = fn; - active.data = data; - active.fnret = 0; - idle.fn = chill; - idle.data = NULL; - - set_state(STOPMACHINE_PREPARE); - - /* Schedule the stop_cpu work on all cpus: hold this CPU so one - * doesn't hit this CPU until we're ready. */ - get_cpu(); - for_each_online_cpu(i) { - sm_work = per_cpu_ptr(stop_machine_work, i); - INIT_WORK(sm_work, stop_cpu); - queue_work_on(i, stop_machine_wq, sm_work); - } - /* This will release the thread on our CPU. 
*/ - put_cpu(); - flush_workqueue(stop_machine_wq); - ret = active.fnret; - mutex_unlock(&lock); - return ret; + struct stop_machine_data smdata = { .fn = fn, .data = data, + .num_threads = num_online_cpus(), + .active_cpus = cpus }; + + /* Set the initial state and stop all online cpus. */ + set_state(&smdata, STOPMACHINE_PREPARE); + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); } int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; - ret = stop_machine_create(); - if (ret) - return ret; /* No CPUs can come up or down during this. */ get_online_cpus(); ret = __stop_machine(fn, data, cpus); put_online_cpus(); - stop_machine_destroy(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); + +#endif /* CONFIG_STOP_MACHINE */ diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b8..7f5a0cd296a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,8 +33,10 @@ #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> +#include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> +#include <linux/gfp.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -222,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -267,6 +270,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -488,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (rgid != (gid_t) -1) { if (old->gid == rgid || @@ -539,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; @@ -569,13 +565,7 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (!task_can_switch_user(new_user, current)) { - free_uid(new_user); - return -EINVAL; - } - - if (atomic_read(&new_user->processes) >= - current->signal->rlim[RLIMIT_NPROC].rlim_cur && + if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && new_user != INIT_USER) { free_uid(new_user); return -EAGAIN; @@ -612,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = ruid; @@ -677,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; @@ -721,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) if (!new) return -ENOMEM; - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - goto error; old = current_cred(); retval = -EPERM; @@ -790,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - goto error; - retval = -EPERM; if 
(!capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && @@ -853,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) old = current_cred(); old_fsuid = old->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { @@ -866,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) } } -error: abort_creds(new); return old_fsuid; @@ -890,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) old = current_cred(); old_fsgid = old->fsgid; - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - goto error; - if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { @@ -902,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) } } -error: abort_creds(new); return old_fsgid; @@ -964,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) pgid = pid; if (pgid < 0) return -EINVAL; + rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -1017,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); return err; } @@ -1118,6 +1087,15 @@ out: DECLARE_RWSEM(uts_sem); +#ifdef COMPAT_UTS_MACHINE +#define override_architecture(name) \ + (personality(current->personality) == PER_LINUX32 && \ + copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ + sizeof(COMPAT_UTS_MACHINE))) +#else +#define override_architecture(name) 0 +#endif + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1126,9 +1104,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); + + if (!errno && override_architecture(name)) + errno = -EFAULT; return errno; } +#ifdef __ARCH_WANT_SYS_OLD_UNAME +/* + * Old cruft + */ +SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) +{ + int error = 0; + + if (!name) + return -EFAULT; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof(*name))) + error = -EFAULT; + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error; +} + +SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error ? 
-EFAULT : 0; +} +#endif + SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; @@ -1203,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } + struct rlimit value; + int ret; + + ret = do_prlimit(current, resource, NULL, &value); + if (!ret) + ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + + return ret; } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1239,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +static inline bool rlim64_is_infinity(__u64 rlim64) { - struct rlimit new_rlim, *old_rlim; - int retval; +#if BITS_PER_LONG < 64 + return rlim64 >= ULONG_MAX; +#else + return rlim64 == RLIM64_INFINITY; +#endif +} + +static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) +{ + if (rlim->rlim_cur == RLIM_INFINITY) + rlim64->rlim_cur = RLIM64_INFINITY; + else + rlim64->rlim_cur = rlim->rlim_cur; + if (rlim->rlim_max == RLIM_INFINITY) + rlim64->rlim_max = RLIM64_INFINITY; + else + rlim64->rlim_max = rlim->rlim_max; +} + +static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) +{ + if (rlim64_is_infinity(rlim64->rlim_cur)) + rlim->rlim_cur = RLIM_INFINITY; + else + rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; + if (rlim64_is_infinity(rlim64->rlim_max)) + rlim->rlim_max = RLIM_INFINITY; + else + rlim->rlim_max = (unsigned long)rlim64->rlim_max; +} + +/* make sure you are allowed to change @tsk limits before calling this */ +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - - retval = security_task_setrlimit(resource, &new_rlim); - if (retval) - return retval; - - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". 
So let's cheat and make it one second - * instead - */ - new_rlim.rlim_cur = 1; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; } - task_lock(current->group_leader); - *old_rlim = new_rlim; - task_unlock(current->group_leader); - - if (resource != RLIMIT_CPU) + /* protect tsk->signal and tsk->sighand from disappearing */ + read_lock(&tasklist_lock); + if (!tsk->sighand) { + retval = -ESRCH; goto out; + } + + rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim) { + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, + resource, new_rlim); + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim->rlim_cur = 1; + } + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Note that the kernel fails to return an error @@ -1284,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim.rlim_cur == RLIM_INFINITY) - goto out; - - update_rlimit_cpu(new_rlim.rlim_cur); + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY) + update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: + read_unlock(&tasklist_lock); + return retval; +} + +/* rcu lock must be held */ +static int check_prlimit_permission(struct task_struct *task) +{ + const struct cred *cred = current_cred(), *tcred; + + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_RESOURCE)) { + return -EPERM; + } + return 0; } +SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, + const struct rlimit64 __user *, new_rlim, + struct rlimit64 __user *, old_rlim) +{ + struct rlimit64 old64, new64; + struct rlimit old, new; + struct task_struct *tsk; + int ret; + + if (new_rlim) { + if (copy_from_user(&new64, new_rlim, sizeof(new64))) + return -EFAULT; + rlim64_to_rlim(&new64, &new); + } + + rcu_read_lock(); + tsk = pid ? find_task_by_vpid(pid) : current; + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + ret = check_prlimit_permission(tsk); + if (ret) { + rcu_read_unlock(); + return ret; + } + get_task_struct(tsk); + rcu_read_unlock(); + + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + + if (!ret && old_rlim) { + rlim_to_rlim64(&old, &old64); + if (copy_to_user(old_rlim, &old64, sizeof(old64))) + ret = -EFAULT; + } + + put_task_struct(tsk); + return ret; +} + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_prlimit(current, resource, &new_rlim, NULL); +} + /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. 
After @@ -1599,9 +1748,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static void argv_cleanup(char **argv, char **envp) +static void argv_cleanup(struct subprocess_info *info) { - argv_free(argv); + argv_free(info->argv); } /** @@ -1635,7 +1784,7 @@ int orderly_poweroff(bool force) goto out; } - call_usermodehelper_setcleanup(info, argv_cleanup); + call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); ret = call_usermodehelper_exec(info, UMH_NO_WAIT); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 695384f12a7..bad369ec540 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); @@ -180,3 +181,7 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); + +/* fanotify! */ +cond_syscall(sys_fanotify_init); +cond_syscall(sys_fanotify_mark); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b244846..3a45c224770 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/slab.h> #include <linux/sysctl.h> +#include <linux/signal.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> @@ -36,20 +37,24 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> +#include <linux/compaction.h> #include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> +#include <linux/dnotify.h> #include <linux/syscalls.h> #include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/slow-work.h> #include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/pipe_fs_i.h> +#include <linux/oom.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -59,18 +64,29 @@ #include <asm/stacktrace.h> #include <asm/io.h> #endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include <linux/acct.h> +#endif +#ifdef CONFIG_RT_MUTEXES +#include <linux/rtmutex.h> +#endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include <linux/lockdep.h> +#endif +#ifdef CONFIG_CHR_DEV_SG +#include <scsi/sg.h> +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR +#include <linux/nmi.h> +#endif #if defined(CONFIG_SYSCTL) /* External variables not in a header file. 
*/ -extern int C_A_D; -extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern int sysctl_panic_on_oom; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -87,15 +103,12 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ #ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; static int neg_one = -1; #endif @@ -119,14 +132,9 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_MODULES -extern char modprobe_path[]; -extern int modules_disabled; -#endif -#ifdef CONFIG_CHR_DEV_SG -extern int sg_big_buff; +#ifdef CONFIG_INOTIFY_USER +#include <linux/inotify.h> #endif - #ifdef CONFIG_SPARC #include <asm/system.h> #endif @@ -148,10 +156,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -extern int acct_parm[]; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; @@ -159,10 +163,6 @@ extern int unaligned_dump_stack; extern struct ratelimit_state printk_ratelimit_state; -#ifdef CONFIG_RT_MUTEXES -extern int max_lock_depth; -#endif - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -170,6 +170,27 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_MAGIC_SYSRQ +static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { @@ -190,9 +211,6 @@ static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; extern struct ctl_table random_table[]; -#ifdef CONFIG_INOTIFY_USER -extern struct ctl_table inotify_table[]; -#endif #ifdef CONFIG_EPOLL extern struct ctl_table epoll_table[]; #endif @@ -201,9 +219,6 @@ extern struct ctl_table epoll_table[]; int sysctl_legacy_va_layout; #endif -extern int prove_locking; -extern int lock_stat; - /* The default sysctl tables: */ static struct ctl_table root_table[] = { @@ -250,6 +265,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */ static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + static struct ctl_table kern_table[] = { { .procname = "sched_child_runs_first", @@ -544,7 +564,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#ifdef CONFIG_HOTPLUG { .procname = "hotplug", .data = &uevent_helper, @@ -577,7 +597,7 @@ static struct ctl_table kern_table[] = { .data = 
&__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysrq_sysctl_handler, }, #endif #ifdef CONFIG_PROC_SYSCTL @@ -631,7 +651,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "userprocess_debug", - .data = &sysctl_userprocess_debug, + .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -692,7 +712,34 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, @@ -795,26 +842,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, -#endif #ifdef CONFIG_DETECT_HUNG_TASK { .procname = "hung_task_panic", @@ -888,13 +915,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SLOW_WORK - { - .procname = "slow-work", - .mode = 0555, - .child = slow_work_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS { .procname = "perf_event_paranoid", @@ -1109,6 +1129,25 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + +#endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", .data = &min_free_kbytes, @@ -1433,6 +1472,14 @@ static struct ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1441,7 +1488,8 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ + defined(CONFIG_S390) { .procname = "exception-trace", .data = &show_unhandled_signals, @@ -1450,6 +1498,17 
@@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; @@ -1654,10 +1713,7 @@ static __init int sysctl_init(void) { sysctl_set_parent(NULL, root_table); #ifdef CONFIG_SYSCTL_SYSCALL_CHECK - { - int err; - err = sysctl_check_table(current->nsproxy, root_table); - } + sysctl_check_table(current->nsproxy, root_table); #endif return 0; } @@ -2039,8 +2095,132 @@ int proc_dostring(struct ctl_table *table, int write, buffer, lenp, ppos); } +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formatted integer from a user buffer + * + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + *val = simple_strtoul(p, &p, 0); + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; + + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - converts an integer to a decimal ASCII formatted string + * + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes written. + */ +static int proc_put_long(void __user **buf, size_t *size, unsigned long val, + bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? 
"-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +} +#undef TMPBUFLEN + +static int proc_put_char(void __user **buf, size_t *size, char c) +{ + if (*size) { + char __user **buffer = (char __user **)buf; + if (put_user(c, *buffer)) + return -EFAULT; + (*size)--, (*buffer)++; + *buf = *buffer; + } + return 0; +} -static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2049,33 +2229,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } return 0; } +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { -#define TMPBUFLEN 21 - int *i, vleft, first = 1, neg; - unsigned long lval; - size_t left, len; - - char buf[TMPBUFLEN], *p; - char __user *s = buffer; + int *i, vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; - if (!tbl_data || !table->maxlen || !*lenp || - (*ppos && !write)) { + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } @@ -2087,89 +2265,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!conv) conv = do_proc_dointvec_conv; + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, first=0) { + unsigned long lval; + bool neg; + if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } + left -= proc_skip_spaces(&kbuf); + if (!left) break; - neg = 0; - len = left; - if (len > sizeof(buf) - 1) - len = sizeof(buf) - 1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') + err = proc_get_long(&kbuf, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; - - lval = simple_strtoul(p, &p, 0); - - len = p-buf; - if ((len < left) && *p && !isspace(*p)) - break; - s += len; - left -= len; - - if (conv(&neg, &lval, i, 1, data)) + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; break; + } } else { - p = buf; + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } if (!first) - *p++ = '\t'; - - if (conv(&neg, &lval, i, 0, data)) + err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + err = proc_put_long(&buffer, &left, lval, neg); + if (err) break; - - sprintf(p, "%s%lu", neg ? 
"-" : "", lval); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err && left) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { @@ -2237,8 +2397,8 @@ struct do_proc_dointvec_minmax_conv_param { int *max; }; -static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, - int *valp, +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) { struct do_proc_dointvec_minmax_conv_param *param = data; @@ -2251,10 +2411,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } @@ -2294,102 +2454,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int unsigned long convmul, unsigned long convdiv) { -#define TMPBUFLEN 21 - unsigned long *i, *min, *max, val; - int vleft, first=1, neg; - size_t len, left; - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!data || !table->maxlen || !*lenp || - (*ppos && !write)) { + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; - - for (; left && vleft--; i++, min++, max++, first=0) { + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + + for (; left && vleft--; i++, first = 0) { + unsigned long val; + if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > TMPBUFLEN-1) - len = TMPBUFLEN-1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; - val = simple_strtoul(p, &p, 0) * convmul / convdiv ; - len = p-buf; - if ((len < left) && *p && !isspace(*p)) + bool neg; + + left -= proc_skip_spaces(&kbuf); + + err = proc_get_long(&kbuf, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; if (neg) - val = -val; - s += len; - left -= len; - - if(neg) continue; if ((min && val < *min) || (max && val > *max)) continue; *i = val; } else { - 
p = buf; + val = convdiv * (*i) / convmul; if (!first) - *p++ = '\t'; - sprintf(p, "%lu", convdiv * (*i) / convmul); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; + err = proc_put_char(&buffer, &left, '\t'); + err = proc_put_long(&buffer, &left, val, false); + if (err) + break; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, @@ -2450,7 +2586,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2462,10 +2598,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = lval / HZ; @@ -2473,7 +2609,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2485,10 +2621,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_clock_t(lval); @@ -2496,7 +2632,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2506,10 +2642,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_msecs(lval); @@ -2606,6 +2742,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, return 0; } +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. 
+ */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + while (val_a <= val_b) + set_bit(val_a++, tmp_bitmap); + + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if (err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + memcpy(bitmap, tmp_bitmap, + BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + #else /* CONFIG_PROC_FS */ int proc_dostring(struct ctl_table *table, int write, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707..1357c578606 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,8 @@ #include <linux/file.h> #include <linux/ctype.h> #include <linux/netdevice.h> +#include <linux/kernel.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL_SYSCALL @@ -223,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, {} }; @@ -1124,11 +1125,6 @@ out: return result; } -static unsigned hex_value(int ch) -{ - return isdigit(ch) ? 
ch - '0' : ((ch | 0x20) - 'a') + 10; -} - static ssize_t bin_uuid(struct file *file, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { @@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file, if (!isxdigit(str[0]) || !isxdigit(str[1])) goto out; - uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); + uuid[i] = (hex_to_bin(str[0]) << 4) | + hex_to_bin(str[1]); str += 2; if (*str == '-') str++; @@ -1331,7 +1328,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, ssize_t result; char *pathname; int flags; - int acc_mode, fmode; + int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1342,15 +1339,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (oldval && oldlen && newval && newlen) { flags = O_RDWR; acc_mode = MAY_READ | MAY_WRITE; - fmode = FMODE_READ | FMODE_WRITE; } else if (newval && newlen) { flags = O_WRONLY; acc_mode = MAY_WRITE; - fmode = FMODE_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; acc_mode = MAY_READ; - fmode = FMODE_READ; } else { result = 0; goto out_putname; @@ -1361,7 +1355,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (result) goto out_putname; - result = may_open(&nd.path, acc_mode, fmode); + result = may_open(&nd.path, acc_mode, flags); if (result) goto out_putpath; diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf72c82..10b90d8a03c 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->maxlen) set_fail(&fail, table, "No maxlen"); } - if ((table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (table->maxlen > sizeof (unsigned long)) { - if (!table->extra1) - set_fail(&fail, table, "No min"); - if (!table->extra2) - set_fail(&fail, table, "No max"); - } - } #ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea8384d3caa..11281d5792b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -22,6 +22,7 @@ #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/cgroupstats.h> #include <linux/cgroup.h> #include <linux/fs.h> @@ -46,15 +47,13 @@ static struct genl_family family = { .maxattr = TASKSTATS_CMD_ATTR_MAX, }; -static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] -__read_mostly = { +static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static struct nla_policy -cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { +static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; diff --git a/kernel/time.c b/kernel/time.c index 804798005d1..ba9b338d183 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -35,7 +35,6 @@ #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> -#include <linux/slab.h> #include <linux/math64.h> #include <linux/ptrace.h> @@ -133,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void 
warp_clock(void) { - write_seqlock_irq(&xtime_lock); - wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); + struct timespec adjust; + + adjust = current_kernel_time(); + adjust.tv_sec += sys_tz.tz_minuteswest * 60; + do_settimeofday(&adjust); } /* @@ -302,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0..f06a8a36564 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -6,7 +6,7 @@ config TICK_ONESHOT config NO_HZ bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables a tickless system: timer interrupts will @@ -15,7 +15,7 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d3..c18d7efa1b4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) { unsigned long flags; - spin_lock_irqsave(&watchdog_lock, flags); + /* + * We use trylock here to avoid a potential dead lock when + * kgdb calls this code after the kernel has been stopped with + * watchdog_lock held. When watchdog_lock is held we just + * return and accept, that the watchdog might trigger and mark + * the monitored clock source (usually TSC) unstable. + * + * This does not affect the other caller clocksource_resume() + * because at this point the kernel is UP, interrupts are + * disabled and nothing can hold watchdog_lock. + */ + if (!spin_trylock_irqsave(&watchdog_lock, flags)) + return; clocksource_reset_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** + * clocksource_suspend - suspend the clocksource(s) + */ +void clocksource_suspend(void) +{ + struct clocksource *cs; + + list_for_each_entry_reverse(cs, &clocksource_list, list) + if (cs->suspend) + cs->suspend(cs); +} + +/** * clocksource_resume - resume the clocksource(s) */ void clocksource_resume(void) @@ -449,7 +473,7 @@ void clocksource_resume(void) list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) - cs->resume(); + cs->resume(cs); clocksource_resume_watchdog(); } @@ -458,8 +482,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. 
- * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { @@ -507,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) return max_nsecs - (max_nsecs >> 5); } -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET /** * clocksource_select - Select the best clocksource available @@ -553,7 +577,7 @@ static void clocksource_select(void) } } -#else /* CONFIG_GENERIC_TIME */ +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -568,6 +592,10 @@ static inline void clocksource_select(void) { } */ static int __init clocksource_done_booting(void) { + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + mutex_unlock(&clocksource_mutex); + finished_booting = 1; /* @@ -597,6 +625,73 @@ static void clocksource_enqueue(struct clocksource *cs) list_add(&cs->list, entry); } + +/* + * Maximum time we expect to go between ticks. This includes idle + * tickless time. It provides the trade off between selecting a + * mult/shift pair that is very precise but can only handle a short + * period of time, vs. a mult/shift pair that can handle long periods + * of time but isn't as precise. + * + * This is a subsystem constant, and actual hardware limitations + * may override it (ie: clocksources that wrap every 3 seconds). + */ +#define MAX_UPDATE_LENGTH 5 /* Seconds */ + +/** + * __clocksource_updatefreq_scale - Used update clocksource with new freq + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * This should only be called from the clocksource->enable() method. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. + */ +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + /* + * Ideally we want to use some of the limits used in + * clocksource_max_deferment, to provide a more informed + * MAX_UPDATE_LENGTH. But for now this just gets the + * register interface working properly. + */ + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC/scale, + MAX_UPDATE_LENGTH*scale); + cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. 
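
The __clocksource_updatefreq_scale() helper above relies on clocks_calc_mult_shift() to choose a mult/shift pair such that nanoseconds = (cycles * mult) >> shift holds over at most MAX_UPDATE_LENGTH seconds. The sketch below is a simplified stand-alone illustration of how such a pair can be picked and applied; it is not the kernel's exact selection algorithm, and the 24 MHz example frequency is an arbitrary assumption.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/*
 * Pick mult/shift so that ns = (cycles * mult) >> shift for a counter
 * running at 'freq' Hz, taking the largest shift for which 'maxsec'
 * seconds worth of cycles times mult still fits in 64 bits.
 */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint64_t freq, uint32_t maxsec)
{
	uint64_t m = 0;
	uint32_t sft;

	for (sft = 32; sft > 0; sft--) {
		m = ((NSEC_PER_SEC << sft) + freq / 2) / freq;
		if (m >> 32)
			continue;			/* mult must fit in 32 bits */
		if ((uint64_t)maxsec * freq <= UINT64_MAX / m)
			break;				/* cycles * mult stays in 64 bits */
	}
	*mult = (uint32_t)m;
	*shift = sft;
}

int main(void)
{
	uint32_t mult, shift;
	uint64_t freq = 24000000;			/* e.g. a 24 MHz counter */
	uint64_t cycles = 24000;			/* 1 ms worth of cycles */

	calc_mult_shift(&mult, &shift, freq, 5 /* cf. MAX_UPDATE_LENGTH */);
	printf("mult=%u shift=%u -> %llu ns\n", mult, shift,
	       (unsigned long long)((cycles * mult) >> shift));
	return 0;
}
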
+ */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Intialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + + /* Add clocksource to the clcoksource list */ + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + + /** * clocksource_register - Used to install new clocksources * @t: clocksource to be registered diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4800f933910..c63116863a8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -58,10 +58,10 @@ static s64 time_offset; static long time_constant = 2; /* maximum error (usecs): */ -long time_maxerror = NTP_PHASE_LIMIT; +static long time_maxerror = NTP_PHASE_LIMIT; /* estimated error (usecs): */ -long time_esterror = NTP_PHASE_LIMIT; +static long time_esterror = NTP_PHASE_LIMIT; /* frequency offset (scaled nsecs/secs): */ static s64 time_freq; @@ -69,7 +69,7 @@ static s64 time_freq; /* time at last adjustment (secs): */ static long time_reftime; -long time_adjust; +static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; @@ -142,11 +142,11 @@ static void ntp_update_offset(long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = xtime.tv_sec - time_reftime; + secs = get_seconds() - time_reftime; if (unlikely(time_status & STA_FREQHOLD)) secs = 0; - time_reftime = xtime.tv_sec; + time_reftime = get_seconds(); offset64 = offset; freq_adj = (offset64 * secs) << @@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = xtime.tv_sec; + time_reftime = get_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5fc66..48b2761b566 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213016f..aada0e52680 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -22,6 +22,29 @@ #include "tick-internal.h" +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) + return -ETIME; + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? 
dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + /** * tick_program_event internal worker function */ @@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, if (!ret || !force) return ret; + dev->retries++; /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it + * We tried 3 times to program the device with the given + * min_delta_ns. If that's not working then we increase it * and emit a warning. */ if (++i > 2) { /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns << 1); - + if (tick_increase_min_delta(dev)) { + /* + * Get out of the loop if min_delta_ns + * hit the limit already. That's + * better than staying here forever. + * + * We clear next_event so we have a + * chance that the box survives. + */ + printk(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } i = 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f992762d7f5..3e216e01bbd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,35 +150,65 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog(); } +/* + * Updates the per cpu time idle statistics counters + */ +static void +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +{ + ktime_t delta; + + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + if (nr_iowait_cpu(cpu) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + ts->idle_entrytime = now; + } + + if (last_update_time) + *last_update_time = ktime_to_us(now); + +} + static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t delta; - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { - ktime_t now, delta; + ktime_t now; now = ktime_get(); - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - } + + update_ts_time_stats(cpu, ts, now, NULL); + ts->idle_entrytime = now; ts->idle_active = 1; sched_clock_idle_sleep_event(); return now; } +/** + * get_cpu_idle_time_us - get the total idle time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cummulative idle time (since boot) for a given + * CPU, in microseconds. The idle time returned includes + * the iowait time (unlike what "top" and co report). + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. 
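
The reprogramming path above no longer doubles min_delta_ns forever: tick_increase_min_delta() starts at 5 us, grows the value by 50% per failure and gives up once it reaches one jiffy, at which point the caller clears next_event and returns -ETIME. A stand-alone sketch of that backoff, assuming HZ=250 purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		250				/* assumed for this example */
#define MIN_DELTA_LIMIT	(NSEC_PER_SEC / HZ)		/* one jiffy: 4 ms at HZ=250 */

/* Mirrors the backoff in tick_increase_min_delta(); returns -1 at the cap. */
static int increase_min_delta(uint64_t *min_delta_ns)
{
	if (*min_delta_ns >= MIN_DELTA_LIMIT)
		return -1;

	if (*min_delta_ns < 5000)
		*min_delta_ns = 5000;
	else
		*min_delta_ns += *min_delta_ns >> 1;	/* grow by 50% */

	if (*min_delta_ns > MIN_DELTA_LIMIT)
		*min_delta_ns = MIN_DELTA_LIMIT;
	return 0;
}

int main(void)
{
	uint64_t min_delta = 0;

	while (increase_min_delta(&min_delta) == 0)
		printf("min_delta_ns = %llu\n", (unsigned long long)min_delta);
	return 0;
}
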
+ */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - if (ts->idle_active) - *last_update_time = ktime_to_us(ts->idle_lastupdate); - else - *last_update_time = ktime_to_us(ktime_get()); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); +/* + * get_cpu_iowait_time_us - get the total iowait time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cummulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_enabled) + return -1; + + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + + return ktime_to_us(ts->iowait_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task * @@ -231,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates @@ -352,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -727,7 +774,6 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); - u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -737,10 +783,6 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 12f5c55090b..ac38fbb176c 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -19,6 +19,7 @@ #include <linux/timecompare.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/math64.h> /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4..49010d822f7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. 
*/ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; /* @@ -165,23 +165,15 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } -#ifdef CONFIG_GENERIC_TIME - /** * timekeeping_forward_now - update clock to the current time * @@ -332,12 +324,11 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -385,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) tick_clock_notify(); } -#else /* GENERIC_TIME */ - -static inline void timekeeping_forward_now(void) { } - -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. 
- */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -#endif /* !GENERIC_TIME */ - /** * ktime_get_real - get the real (wall-) time in ktime_t format * @@ -559,7 +504,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -589,11 +533,10 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); + xtime = timespec_add(xtime, ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + total_sleep_time = timespec_add(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -622,6 +565,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + clocksource_suspend(); return 0; } @@ -746,6 +690,7 @@ static void timekeeping_adjust(s64 offset) static cycle_t logarithmic_accumulation(cycle_t offset, int shift) { u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ if (offset < timekeeper.cycle_interval<<shift) @@ -762,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) second_overflow(); } - /* Accumulate into raw time */ - raw_time.tv_nsec += timekeeper.raw_interval << shift;; - while (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; + /* Accumulate raw time */ + raw_nsecs = timekeeper.raw_interval << shift; + raw_nsecs += raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; + raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); + raw_time.tv_sec += raw_secs; } + raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; @@ -787,7 +735,6 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -795,10 +742,11 @@ void update_wall_time(void) return; clock = timekeeper.clock; -#ifdef CONFIG_GENERIC_TIME - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; -#else + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = timekeeper.cycle_interval; +#else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; @@ -817,7 +765,8 @@ void update_wall_time(void) shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); - shift--; + if(offset < timekeeper.cycle_interval<<shift) + shift--; } /* correct the clock when NTP error is too big */ @@ -845,7 +794,9 @@ void update_wall_time(void) timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } - /* store 
full nanoseconds into xtime after rounding it up and + + /* + * Store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; @@ -853,11 +804,19 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); + /* + * Finally, make sure that after the rounding + * xtime.tv_nsec isn't larger then NSEC_PER_SEC + */ + if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { + xtime.tv_nsec -= NSEC_PER_SEC; + xtime.tv_sec++; + second_overflow(); + } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** @@ -880,6 +839,7 @@ void getboottime(struct timespec *ts) set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -887,18 +847,24 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + *ts = timespec_add(*ts, total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; +} + +struct timespec __get_wall_to_monotonic(void) +{ + return wall_to_monotonic; } struct timespec current_kernel_time(void) @@ -909,7 +875,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -924,7 +890,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd1050..ab8f5e33fa9 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_waketime); P_ns(idle_exittime); P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); P(last_jiffies); P(next_jiffies); P_ns(idle_expires); @@ -228,6 +229,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); } static void timer_list_show_tickdevices(struct seq_file *m) @@ -257,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.5\n"); + SEQ_printf(m, "Timer List Version: v0.6\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index c61a7949387..97bf05baade 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include <linux/kallsyms.h> #include <linux/perf_event.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -89,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; 
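
Both timekeeping hunks above tighten the nanosecond handling: raw time now folds whole seconds out of the accumulated nanoseconds with a single do_div() instead of a subtract loop, and xtime.tv_nsec is checked against NSEC_PER_SEC after rounding so a leftover overflow triggers second_overflow(). A minimal stand-alone sketch of that normalization idea (the struct and function names are illustrative only):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

struct ts { int64_t sec; uint64_t nsec; };

/* Fold whole seconds out of the nanosecond field with one division. */
static void ts_add_ns(struct ts *t, uint64_t ns)
{
	ns += t->nsec;
	if (ns >= NSEC_PER_SEC) {
		t->sec += ns / NSEC_PER_SEC;
		ns %= NSEC_PER_SEC;
	}
	t->nsec = ns;
}

int main(void)
{
	struct ts t = { .sec = 100, .nsec = 900000000 };

	ts_add_ns(&t, 3500000000ULL);		/* add 3.5 s */
	printf("%lld.%09llu\n", (long long)t.sec, (unsigned long long)t.nsec);
	return 0;
}
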
/* * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. */ #define TBASE_DEFERRABLE_FLAG (0x1) @@ -318,6 +324,25 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); +/** + * set_timer_slack - set the allowed slack for a timer + * @timer: the timer to be modified + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead. + */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) @@ -549,6 +574,7 @@ static void __init_timer(struct timer_list *timer, { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); + timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; @@ -557,6 +583,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -659,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); @@ -714,6 +749,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + expires_limit = expires; + + if (timer->slack >= 0) { + expires_limit = expires + timer->slack; + } else { + unsigned long now = jiffies; + + /* No slack, if already expired else auto slack 0.4% */ + 
if (time_after(expires, now)) + expires_limit = expires + (expires - now)/256; + } + mask = expires ^ expires_limit; + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1 << bit) - 1; + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -744,6 +819,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; + expires = apply_slack(timer, expires); + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); @@ -880,6 +957,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer == timer) goto out; + timer_stats_timer_clear_start_info(timer); ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); @@ -953,6 +1031,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) return index; } +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (preempt_count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count() = preempt_count; + } +} + #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** @@ -996,45 +1115,7 @@ static inline void __run_timers(struct tvec_base *base) detach_timer(timer, 1); spin_unlock_irq(&base->lock); - { - int preempt_count = preempt_count(); - -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the timer from - * inside the function that is called from - * it, this we need to take into account for - * lockdep too. To avoid bogus "held lock - * freed" warnings as well as problems when - * looking into timer->lockdep_map, make a - * copy and use that here. - */ - struct lockdep_map lockdep_map = - timer->lockdep_map; -#endif - /* - * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map - * around the fn() call here and in - * del_timer_sync(). 
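
apply_slack() above widens the requested expiry by the timer's slack (or by roughly 0.4% of the remaining delay when slack is -1) and then clears every bit below the highest bit in which the original and widened expiries differ, so nearby timers tend to round onto the same jiffy and fire together. A stand-alone rendition of that rounding, using the GCC/Clang __builtin_clzl() builtin in place of the kernel's find_last_bit():

#include <stdio.h>

/* 'now', 'expires' and the slack are in jiffies; slack < 0 means auto 0.4%. */
static unsigned long round_with_slack(unsigned long expires,
				      unsigned long now, long slack)
{
	unsigned long limit = expires, mask;
	int bit;

	if (slack >= 0)
		limit = expires + slack;
	else if (expires > now)
		limit = expires + (expires - now) / 256;

	mask = expires ^ limit;
	if (!mask)
		return expires;

	/* index of the highest differing bit */
	bit = (int)(sizeof(unsigned long) * 8) - 1 - __builtin_clzl(mask);
	mask = (1UL << bit) - 1;

	return limit & ~mask;
}

int main(void)
{
	/* a timer 10000 jiffies out with automatic slack: 1010000 rounds to 1010016 */
	printf("%lu\n", round_with_slack(1000000 + 10000, 1000000, -1));
	return 0;
}
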
- */ - lock_map_acquire(&lockdep_map); - - trace_timer_expire_entry(timer); - fn(data); - trace_timer_expire_exit(timer); - - lock_map_release(&lockdep_map); - - if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); - } - } + call_timer_fn(timer, fn, data); spin_lock_irq(&base->lock); } } @@ -1223,7 +1304,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* @@ -1618,11 +1698,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + int err; + switch(action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (init_timers_cpu(cpu) < 0) - return NOTIFY_BAD; + err = init_timers_cpu(cpu); + if (err < 0) + return notifier_from_errno(err); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: @@ -1648,7 +1731,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } @@ -1681,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f28..538501c6ea5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool @@ -46,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD help See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER - bool - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -158,7 +153,7 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -180,7 +175,7 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE @@ -199,15 +194,6 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. 
- config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -234,23 +220,6 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in initcall_debug and ftrace=initcall to the kernel - command line to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -330,37 +299,6 @@ config BRANCH_TRACER Say N if unsure. -config POWER_TRACER - bool "Trace power consumption behavior" - depends on X86 - select GENERIC_TRACER - help - This tracer helps developers to analyze and optimize the kernel's - power management decisions, specifically the C-state and P-state - behavior. - -config KSYM_TRACER - bool "Trace read and write access on kernel memory locations" - depends on HAVE_HW_BREAKPOINT - select TRACING - help - This tracer helps find read and write operations on any given kernel - symbol i.e. /proc/kallsyms. - -config PROFILE_KSYM_TRACER - bool "Profile all kernel memory accesses on 'watched' variables" - depends on KSYM_TRACER - help - This tracer profiles kernel accesses on variables watched through the - ksym tracer ftrace plugin. Depending upon the hardware, all read - and write operations on kernel variables can be monitored for - accesses. - - The results will be displayed in: - /debugfs/tracing/profile_ksym - - Say N if unsure. - config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -385,45 +323,6 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer, giving access to the last N branches for each cpu. - -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - -config WORKQUEUE_TRACER - bool "Trace workqueues" - select GENERIC_TRACER - help - The workqueue tracer provides some statistical information - about each cpu workqueue thread such as the number of the - works inserted and executed since their creation. It can help - to evaluate the amount of work each of them has to perform. - For example it can help a developer to decide whether he should - choose a per-cpu workqueue instead of a singlethreaded one. 
- config BLK_DEV_IO_TRACE bool "Support for tracing block IO actions" depends on SYSFS @@ -451,7 +350,7 @@ config BLK_DEV_IO_TRACE config KPROBE_EVENT depends on KPROBES - depends on X86 + depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" select TRACING default y diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd89ec7..53f338190b2 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o -obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o @@ -38,11 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) @@ -51,10 +47,14 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o -obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d9d6206e0b1..959f8d6c8cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/smp_lock.h> #include <linux/time.h> @@ -168,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +#define BLK_TC_HARDBARRIER BLK_TC_BARRIER +#define BLK_TC_RAHEAD BLK_TC_AHEAD + /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. 
Fills out a @@ -193,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNCIO); - what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, HARDBARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); @@ -540,13 +544,49 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return ret; - if (copy_to_user(arg, &buts, sizeof(buts))) + if (copy_to_user(arg, &buts, sizeof(buts))) { + blk_trace_remove(q); return -EFAULT; - + } return 0; } EXPORT_SYMBOL_GPL(blk_trace_setup); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + memcpy(&buts.name, &cbuts.name, 32); + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, 32)) { + blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + int blk_trace_startstop(struct request_queue *q, int start) { int ret; @@ -599,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; + lock_kernel(); mutex_lock(&bdev->bd_mutex); switch (cmd) { @@ -606,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) bdevname(bdev, b); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) + case BLKTRACESETUP32: + bdevname(bdev, b); + ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#endif case BLKTRACESTART: start = 1; case BLKTRACESTOP: @@ -620,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } mutex_unlock(&bdev->bd_mutex); + unlock_kernel(); return ret; } @@ -659,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; - if (blk_pc_request(rq)) { + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, what, rq->errors, rq->cmd_len, rq->cmd); @@ -673,28 +724,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, } } -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_abort(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ABORT); } -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_INSERT); } -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, + struct request_queue *q, struct request *rq) 
{ blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } -static void blk_add_trace_rq_requeue(struct request_queue *q, +static void blk_add_trace_rq_requeue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(struct request_queue *q, +static void blk_add_trace_rq_complete(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); @@ -722,34 +778,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); } -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); } -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -static void blk_add_trace_bio_backmerge(struct request_queue *q, +static void blk_add_trace_bio_backmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); } -static void blk_add_trace_bio_frontmerge(struct request_queue *q, +static void blk_add_trace_bio_frontmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); } -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_QUEUE); } -static void blk_add_trace_getrq(struct request_queue *q, +static void blk_add_trace_getrq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -763,7 +825,8 @@ static void blk_add_trace_getrq(struct request_queue *q, } -static void blk_add_trace_sleeprq(struct request_queue *q, +static void blk_add_trace_sleeprq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -777,7 +840,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q, } } -static void blk_add_trace_plug(struct request_queue *q) +static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -785,7 +848,7 @@ static void blk_add_trace_plug(struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(struct request_queue *q) +static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -798,7 +861,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q) } } -static void blk_add_trace_unplug_timer(struct request_queue *q) +static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -811,7 +874,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q) } } -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, +static void blk_add_trace_split(void *ignore, + struct request_queue *q, struct bio *bio, unsigned int pdu) { struct blk_trace *bt = q->blk_trace; @@ -827,6 +891,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, /** * blk_add_trace_remap - Add a trace for a remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio * @dev: target device @@ -837,8 +902,9 
@@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -857,6 +923,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @rq: the source request * @dev: target device @@ -867,7 +934,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, * Add a trace for that action. * **/ -static void blk_add_trace_rq_remap(struct request_queue *q, +static void blk_add_trace_rq_remap(void *ignore, + struct request_queue *q, struct request *rq, dev_t dev, sector_t from) { @@ -906,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (blk_pc_request(rq)) + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, rq->errors, len, data); else @@ -919,64 +987,64 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); WARN_ON(ret); - ret = 
register_trace_block_unplug_io(blk_add_trace_unplug_io); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); + ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); + ret = register_trace_block_remap(blk_add_trace_remap, NULL); WARN_ON(ret); - ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { - unregister_trace_block_rq_remap(blk_add_trace_rq_remap); - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } @@ -1319,7 +1387,7 @@ out: } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return print_one_line(iter, false); } @@ -1341,7 +1409,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) } static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) { return blk_trace_synthesize_old_trace(iter) ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; @@ -1379,12 +1448,16 @@ static struct tracer blk_tracer __read_mostly = { .set_flag = blk_tracer_set_flag, }; -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, +static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; + static int __init init_blk_tracer(void) { if (!register_ftrace_event(&trace_blk_event)) { @@ -1706,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq) int len = rq->cmd_len; unsigned char *cmd = rq->cmd; - if (!blk_pc_request(rq)) { + if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { buf[0] = '\0'; return; } @@ -1731,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & WRITE) rwbs[i++] = 'W'; - else if (rw & 1 << BIO_RW_DISCARD) + else if (rw & REQ_DISCARD) rwbs[i++] = 'D'; else if (bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (rw & 1 << BIO_RW_AHEAD) + if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & 1 << BIO_RW_BARRIER) + if (rw & REQ_HARDBARRIER) rwbs[i++] = 'B'; - if (rw & 1 << BIO_RW_SYNCIO) + if (rw & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & 1 << BIO_RW_META) + if (rw & REQ_META) rwbs[i++] = 'M'; + if (rw & REQ_SECURE) + rwbs[i++] = 'E'; rwbs[i] = '\0'; } @@ -1755,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) int rw = rq->cmd_flags & 0x03; int bytes; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; bytes = blk_rq_bytes(rq); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e6640f8045..fa7ece649fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,12 +22,13 @@ #include <linux/hardirq.h> #include <linux/kthread.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> #include <linux/ftrace.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/hash.h> +#include <linux/rcupdate.h> #include <trace/events/sched.h> @@ -85,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); -#endif - +/* + * Traverse the ftrace_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! 
*/ - read_barrier_depends(); + struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); op->func(ip, parent_ip); - op = op->next; + op = rcu_dereference_raw(op->next); /*see above*/ }; } @@ -155,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - smp_wmb(); - ftrace_list = ops; + rcu_assign_pointer(ftrace_list, ops); if (ftrace_enabled) { ftrace_func_t func; @@ -264,6 +264,7 @@ struct ftrace_profile { unsigned long counter; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned long long time; + unsigned long long time_squared; #endif }; @@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " Function " - "Hit Time Avg\n" + "Hit Time Avg s^2\n" " -------- " - "--- ---- ---\n"); + "--- ---- --- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -380,11 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; + unsigned long long stddev; #endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); @@ -394,17 +403,31 @@ static int function_stat_show(struct seq_file *m, void *v) avg = rec->time; do_div(avg, rec->counter); - mutex_lock(&mutex); + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Divide only 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide 1000 again. 
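(Aside: the time_squared accumulator added in this hunk exists so the profiler can report spread as well as an average; keeping a running sum of squares lets the sample variance be derived from the per-function totals alone. A minimal stand-alone sketch of that arithmetic, not kernel code and with hypothetical names:

#include <stdio.h>

/* Per-function totals as the profiler keeps them: hit count, summed
 * duration, and summed squared duration. */
struct prof_totals {
        unsigned long long counter;      /* number of hits           */
        unsigned long long time;         /* sum of durations (ns)    */
        unsigned long long time_squared; /* sum of squared durations */
};

/* Sample variance from running sums: (sum_sq - n*avg*avg) / (n - 1). */
static unsigned long long sample_variance(const struct prof_totals *t)
{
        unsigned long long avg;

        if (t->counter <= 1)
                return 0;
        avg = t->time / t->counter;
        return (t->time_squared - t->counter * avg * avg) / (t->counter - 1);
}

int main(void)
{
        /* durations 100, 200, 300 ns: avg 200, sample variance 10000 */
        struct prof_totals t = { .counter = 3, .time = 600, .time_squared = 140000 };

        printf("avg=%llu ns  s^2=%llu ns^2\n", t.time / t.counter, sample_variance(&t));
        return 0;
}

The in-kernel version additionally folds a /1000 into the divisor, since, as the comment above notes, trace_print_graph_duration() divides by 1000 once more when formatting.)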
+ */ + do_div(stddev, (rec->counter - 1) * 1000); + } + trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); - mutex_unlock(&mutex); #endif seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); - return 0; + return ret; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -650,6 +673,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash || !ftrace_profile_enabled) goto out; + /* If the calltime was zero'd ignore it */ + if (!trace->calltime) + goto out; + calltime = trace->rettime - trace->calltime; if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { @@ -668,8 +695,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) } rec = ftrace_find_profiled_func(stat, trace->func); - if (rec) + if (rec) { rec->time += calltime; + rec->time_squared += calltime * calltime; + } out: local_irq_restore(flags); @@ -898,36 +927,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_KPROBES - -static int frozen_record_count; - -static inline void freeze_record(struct dyn_ftrace *rec) -{ - if (!(rec->flags & FTRACE_FL_FROZEN)) { - rec->flags |= FTRACE_FL_FROZEN; - frozen_record_count++; - } -} - -static inline void unfreeze_record(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_FROZEN) { - rec->flags &= ~FTRACE_FL_FROZEN; - frozen_record_count--; - } -} - -static inline int record_frozen(struct dyn_ftrace *rec) -{ - return rec->flags & FTRACE_FL_FROZEN; -} -#else -# define freeze_record(rec) ({ 0; }) -# define unfreeze_record(rec) ({ 0; }) -# define record_frozen(rec) ({ 0; }) -#endif /* CONFIG_KPROBES */ - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1025,6 +1024,21 @@ static void ftrace_bug(int failed, unsigned long ip) } +/* Return 1 if the address range is reserved for ftrace */ +int ftrace_text_reserved(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip <= (unsigned long)end && + rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { @@ -1076,14 +1090,6 @@ static void ftrace_replace_code(int enable) !(rec->flags & FTRACE_FL_CONVERTED)) continue; - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } - failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; @@ -1504,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; return iter; } @@ -1884,7 +1892,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) struct hlist_head *hhd; struct hlist_node *n; unsigned long key; - int resched; key = hash_long(ip, FTRACE_HASH_BITS); @@ -1898,12 +1905,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * period. This syncs the hash iteration and freeing of items * on the hash. rcu_read_lock is too dangerous here. 
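(Aside: looking back at the ftrace_text_reserved() helper added earlier in this file's hunks, the check is a plain interval-overlap test over every recorded mcount site, with the queried [start, end] range treated as end-inclusive. A minimal stand-alone sketch of the predicate, names hypothetical:

#include <stdio.h>

/* A recorded call site occupies [ip, ip + insn_size); it conflicts with
 * a text range [start, end] (end inclusive) when the intervals intersect,
 * which is the same test ftrace_text_reserved() applies per record. */
static int site_in_range(unsigned long ip, unsigned long insn_size,
                         unsigned long start, unsigned long end)
{
        return ip <= end && ip + insn_size > start;
}

int main(void)
{
        printf("%d %d\n",
               site_in_range(0x1000, 5, 0x0ff0, 0x1000),  /* 1: range touches first byte */
               site_in_range(0x1000, 5, 0x1005, 0x1010)); /* 0: range starts past the site */
        return 0;
}
)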
*/ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); hlist_for_each_entry_rcu(entry, n, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_probe_ops __read_mostly = @@ -2300,6 +2307,8 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -2409,7 +2418,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = no_llseek, .release = ftrace_filter_release, }; @@ -2426,6 +2435,7 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; +int ftrace_graph_filter_enabled; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * @@ -2448,7 +2458,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_count && !*pos) + if (!ftrace_graph_filter_enabled && !*pos) return (void *)1; return __g_next(m, pos); @@ -2494,6 +2504,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ftrace_graph_filter_enabled = 0; ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } @@ -2519,7 +2530,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int search_len; - int found = 0; + int fail = 1; int type, not; char *search; bool exists; @@ -2530,37 +2541,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (not) - return -EINVAL; + if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + return -EBUSY; search_len = strlen(search); mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) - break; - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; if (ftrace_match_record(rec, search, search_len, type)) { - /* ensure it is not already in the array */ + /* if it is in the array */ exists = false; - for (i = 0; i < *idx; i++) + for (i = 0; i < *idx; i++) { if (array[i] == rec->ip) { exists = true; break; } - if (!exists) - array[(*idx)++] = rec->ip; - found = 1; + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } + } } } while_for_each_ftrace_rec(); - +out: mutex_unlock(&ftrace_lock); - return found ? 
0 : -EINVAL; + if (fail) + return -EINVAL; + + ftrace_graph_filter_enabled = 1; + return 0; } static ssize_t @@ -2570,16 +2595,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, struct trace_parser parser; ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&graph_lock); - if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { - ret = -EBUSY; - goto out_unlock; - } - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; goto out_unlock; @@ -3222,8 +3242,8 @@ free: } static void -ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +ftrace_graph_probe_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; int index; @@ -3277,7 +3297,7 @@ static int start_graph_tracing(void) } while (ret == -EAGAIN); if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); if (ret) pr_info("ftrace_graph: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -3349,11 +3369,11 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); out: mutex_unlock(&ftrace_lock); @@ -3364,6 +3384,7 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; + t->curr_ret_stack = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; @@ -3373,7 +3394,6 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index a91da69f153..00000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - */ - -#include <linux/tracepoint.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/dcache.h> -#include <linux/fs.h> - -#include <linux/kmemtrace.h> - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - 
return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kfree(unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); - unregister_trace_kfree(kmemtrace_kfree); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - 
kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - 
ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? 
"); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9f4f565b01e..a22582a0616 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -9,7 +9,6 @@ #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/module.h> -#include <linux/slab.h> #define CREATE_TRACE_POINTS #include <trace/events/power.h> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801..bca96377fd4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -14,12 +14,14 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/fs.h> +#include <asm/local.h> #include "trace.h" /* @@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -309,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -328,6 +343,7 @@ struct buffer_page { local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ struct buffer_data_page *page; /* Actual data page */ }; @@ -389,7 +405,7 @@ static inline int test_time_stamp(u64 delta) #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) /* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) int ring_buffer_print_page_header(struct trace_seq *s) { @@ -407,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), @@ -421,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) */ struct ring_buffer_per_cpu { int cpu; + atomic_t record_disabled; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -430,6 +453,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long lost_events; + unsigned long last_overrun; local_t commit_overrun; local_t overrun; local_t entries; @@ -438,7 +463,6 @@ struct ring_buffer_per_cpu { unsigned long read; u64 write_stamp; u64 read_stamp; - atomic_t record_disabled; }; struct ring_buffer { @@ -464,6 +488,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -1198,18 +1224,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - return; + goto out; rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1226,7 +1253,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - return; + goto out; p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); @@ -1235,6 +1262,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_cpu(cpu_buffer); rb_check_pages(cpu_buffer); +out: spin_unlock_irq(&cpu_buffer->reader_lock); } @@ -1544,7 +1572,7 @@ rb_update_event(struct ring_buffer_event *event, case 0: length -= 
RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) event->array[0] = length; else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1719,11 +1747,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } @@ -1740,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * must fill the old tail_page with padding. */ if (tail >= BUF_PAGE_SIZE) { + /* + * If the page was filled, then we still need + * to update the real_end. Reset it to zero + * and the reader will ignore it. + */ + if (tail == BUF_PAGE_SIZE) + tail_page->real_end = 0; + local_sub(length, &tail_page->write); return; } @@ -1748,6 +1784,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, kmemcheck_annotate_bitfield(event, bitfield); /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter. + */ + tail_page->real_end = tail; + + /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip @@ -1965,17 +2008,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) { struct ring_buffer_event *event; - static int once; int ret; - if (unlikely(*delta > (1ULL << 59) && !once++)) { - printk(KERN_WARNING "Delta way too big! %llu" - " ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, - (unsigned long long)*ts, - (unsigned long long)cpu_buffer->write_stamp); - WARN_ON(1); - } + WARN_ONCE(*delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)*delta, + (unsigned long long)*ts, + (unsigned long long)cpu_buffer->write_stamp); /* * The delta is too big, we to add a @@ -2203,8 +2242,6 @@ static void trace_recursive_unlock(void) #endif -static DEFINE_PER_CPU(int, rb_need_resched); - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2225,16 +2262,16 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; - if (atomic_read(&buffer->record_disabled)) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); + + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; if (trace_recursive_lock()) goto out_nocheck; @@ -2256,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (!event) goto out; - /* - * Need to store resched state on this cpu. - * Only the first needs to. - */ - - if (preempt_count() == 1) - per_cpu(rb_need_resched, cpu) = resched; - return event; out: trace_recursive_unlock(); out_nocheck: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); @@ -2316,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. 
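(Aside: the writer-path hunks here all funnel through the same reserve/commit pairing, which this series now brackets with plain preempt_disable_notrace()/preempt_enable_notrace() instead of the per-cpu resched bookkeeping being deleted. As a hedged reminder of how a producer uses that API, with a hypothetical entry layout and error handling reduced to the NULL check:

#include <linux/ring_buffer.h>

struct my_entry {                       /* hypothetical payload */
        unsigned long ip;
        unsigned long val;
};

static void my_write(struct ring_buffer *buffer, unsigned long ip, unsigned long val)
{
        struct ring_buffer_event *event;
        struct my_entry *entry;

        /* Reserve space; NULL means recording is disabled or the
         * reservation could not be satisfied. */
        event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
        if (!event)
                return;

        entry = ring_buffer_event_data(event);
        entry->ip  = ip;
        entry->val = val;

        /* Publish the event; the preemption handling now lives entirely
         * inside the reserve/commit pair. */
        ring_buffer_unlock_commit(buffer, event);
}
)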
- */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return 0; } @@ -2430,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); @@ -2462,15 +2479,15 @@ int ring_buffer_write(struct ring_buffer *buffer, struct ring_buffer_event *event; void *body; int ret = -EBUSY; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - if (atomic_read(&buffer->record_disabled)) - return -EBUSY; + preempt_disable_notrace(); - resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out; cpu = raw_smp_processor_id(); @@ -2497,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return ret; } @@ -2539,7 +2556,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct ring_buffer *buffer) { @@ -2575,7 +2592,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { @@ -2716,6 +2733,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -2822,6 +2841,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long overwrite; unsigned long flags; int nr_loops = 0; int ret; @@ -2863,6 +2883,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; spin: /* @@ -2883,6 +2904,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + + /* * Here's the tricky part. * * We need to move the pointer past the header page. 
@@ -2913,6 +2946,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page = reader; rb_reset_reader_page(cpu_buffer); + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + goto again; out: @@ -2947,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. @@ -2989,8 +3025,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; @@ -3042,6 +3084,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); return event; default: @@ -3060,13 +3104,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered. * We can get multiple timestamps by nested interrupts or also @@ -3081,6 +3134,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { @@ -3138,12 +3196,14 @@ static inline int rb_ok_to_lock(void) * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. 
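(Aside: the hunks just below add a lost_events out-parameter to ring_buffer_peek() and ring_buffer_consume(), fed by the overrun delta recorded in rb_get_reader_page() above. A hedged sketch of how a consumer might use it; the surrounding driver is hypothetical:

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

static void my_drain_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long handled = 0;
        unsigned long lost = 0;
        u64 ts;

        /* Consuming read: each call returns the next event (or NULL) and,
         * with this series, reports how many events were overwritten since
         * the previous successful read. */
        while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost)) != NULL) {
                if (lost)
                        pr_info("cpu%d: %lu events lost\n", cpu, lost);
                handled++;      /* a real consumer would decode the event here */
        }

        pr_info("cpu%d: handled %lu events\n", cpu, handled);
}
)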
*/ struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -3158,7 +3218,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3200,13 +3260,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from + * @cpu: the cpu to read the buffer from + * @ts: a variable to store the timestamp (may be NULL) + * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; @@ -3227,9 +3291,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); - if (event) + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event) { + cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); + } if (dolock) spin_unlock(&cpu_buffer->reader_lock); @@ -3246,23 +3312,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_start - start a non consuming read of the buffer + * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * - * This starts up an iteration through the buffer. It also disables - * the recording to the buffer until the reading is finished. - * This prevents the reading from being corrupted. This is not - * a consuming read, so a producer is not expected. + * This performs the initial preparations necessary to iterate + * through the buffer. Memory is allocated, buffer recording + * is disabled, and the iterator pointer is returned to the caller. * - * Must be paired with ring_buffer_finish. + * Disabling buffer recordng prevents the reading from being + * corrupted. This is not a consuming read, so a producer is not + * expected. + * + * After a sequence of ring_buffer_read_prepare calls, the user is + * expected to make at least one call to ring_buffer_prepare_sync. + * Afterwards, ring_buffer_read_start is invoked to get things going + * for real. + * + * This overall must be paired with ring_buffer_finish. 
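(Aside: the kernel-doc above describes the new three-phase iterator setup. A hedged usage sketch of the sequence, with hypothetical storage for the per-cpu iterators; the apparent point of the split is that a whole batch of CPUs shares a single grace-period wait instead of paying one per ring_buffer_read_start() call:

#include <linux/cpumask.h>
#include <linux/ring_buffer.h>

static struct ring_buffer_iter *iters[NR_CPUS];  /* hypothetical storage */

static void my_start_dump(struct ring_buffer *buffer)
{
        int cpu;

        /* Phase 1: allocate an iterator and disable recording per cpu.
         * A NULL return is tolerated by ring_buffer_read_start() below. */
        for_each_online_cpu(cpu)
                iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

        /* Phase 2: one synchronization covers every prepared iterator. */
        ring_buffer_read_prepare_sync();

        /* Phase 3: finish the per-cpu setup; iteration may now begin,
         * and each iterator is later released with ring_buffer_read_finish(). */
        for_each_online_cpu(cpu)
                ring_buffer_read_start(iters[cpu]);
}
)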
*/ struct ring_buffer_iter * -ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; @@ -3276,15 +3349,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->record_disabled); + + return iter; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); + +/** + * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls + * + * All previously invoked ring_buffer_read_prepare calls to prepare + * iterators will be synchronized. Afterwards, read_buffer_read_start + * calls on those iterators are allowed. + */ +void +ring_buffer_read_prepare_sync(void) +{ synchronize_sched(); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @iter: The iterator returned by ring_buffer_read_prepare + * + * This finalizes the startup of an iteration through the buffer. + * The iterator comes from a call to ring_buffer_read_prepare and + * an intervening ring_buffer_read_prepare_sync must have been + * performed. + * + * Must be paired with ring_buffer_finish. + */ +void +ring_buffer_read_start(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3378,6 +3488,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + rb_head_page_activate(cpu_buffer); } @@ -3653,6 +3766,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; + unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; @@ -3689,6 +3803,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = reader->read; commit = rb_page_commit(reader); + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -3727,6 +3844,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rpos = reader->read; pos += size; + if (rpos >= commit) + break; + event = rb_reader_event(cpu_buffer); size = rb_event_length(event); } while (len > size); @@ -3749,9 +3869,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); } ret = read; + cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) { + /* If there is room at the end of the page to save the + * missed events, then record it there. 
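(Aside: after this change the commit word of a page returned by ring_buffer_read_page() multiplexes three things: the data length in the low bits, RB_MISSED_EVENTS in bit 31, and RB_MISSED_STORED in bit 30 when the overwritten-event count was appended after the data; the benchmark hunk below simply masks the flag bits off with 0xfffff. A hedged stand-alone sketch of a reader decoding it, with the page struct mirroring the benchmark's rb_page and the helper itself hypothetical:

#include <string.h>

#define RB_MISSED_EVENTS  (1UL << 31)   /* events were overwritten        */
#define RB_MISSED_STORED  (1UL << 30)   /* count appended after the data  */

struct rb_page {
        unsigned long long ts;
        unsigned long      commit;      /* flags + data length */
        char               data[4080];  /* illustrative size   */
};

static unsigned long my_decode_page(const struct rb_page *page,
                                    unsigned long *missed)
{
        unsigned long commit = page->commit;
        unsigned long len    = commit & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);

        *missed = 0;
        if (commit & RB_MISSED_EVENTS) {
                if (commit & RB_MISSED_STORED)
                        /* The writer stored the count right after the data. */
                        memcpy(missed, page->data + len, sizeof(*missed));
                else
                        *missed = (unsigned long)-1;  /* lost, count unknown */
        }
        return len;     /* bytes of event data on the page */
}
)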
+ */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + + /* + * This page may be off to user land. Zero it out here. + */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c..302f8a61463 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/time.h> +#include <asm/local.h> struct rb_page { u64 ts; @@ -80,7 +81,7 @@ static enum event_status read_event(int cpu) int *entry; u64 ts; - event = ring_buffer_consume(buffer, cpu, &ts); + event = ring_buffer_consume(buffer, cpu, &ts, NULL); if (!event) return EVENT_DROPPED; @@ -112,7 +113,8 @@ static enum event_status read_page(int cpu) ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; - commit = local_read(&rpage->commit); + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !kill_test; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9..9ec59f54115 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,10 +32,11 @@ #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> +#include <linux/rwsem.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/gfp.h> #include <linux/fs.h> #include "trace.h" @@ -91,22 +92,16 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(ftrace_cpu_disabled); } static inline void ftrace_enable_cpu(void) { - __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(ftrace_cpu_disabled); preempt_enable(); } -static cpumask_var_t __read_mostly tracing_buffer_mask; - -/* Define which cpu buffers are currently read in trace_pipe */ -static cpumask_var_t tracing_reader_cpumask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu(cpu, tracing_buffer_mask) +cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -119,9 +114,12 @@ static cpumask_var_t tracing_reader_cpumask; * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting - * /proc/sys/kernel/ftrace_dump_on_oops to true. 
+ * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops */ -int ftrace_dump_on_oops; + +enum ftrace_dump_mode ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); @@ -141,8 +139,17 @@ __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { - ftrace_dump_on_oops = 1; - return 1; + if (*str++ != '=' || !*str) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -243,19 +250,98 @@ static struct tracer *current_trace __read_mostly; /* * trace_types_lock is used to protect the trace_types list. - * This lock is also used to keep user access serialized. - * Accesses from userspace will grab this lock while userspace - * activities happen inside the kernel. */ static DEFINE_MUTEX(trace_types_lock); +/* + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. + * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewrited + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. + */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. 
*/ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -297,6 +383,21 @@ static int __init set_buf_size(char *str) } __setup("trace_buf_size=", set_buf_size); +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; @@ -324,6 +425,7 @@ static const char *trace_options[] = { "latency-format", "sleep-time", "graph-time", + "record-cmd", NULL }; @@ -502,9 +604,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) static arch_spinlock_t ftrace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +unsigned long __read_mostly tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace @@ -515,7 +618,7 @@ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data = tr->data[cpu]; + struct trace_array_cpu *max_data; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; @@ -525,7 +628,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; @@ -554,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -580,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -624,18 +736,11 @@ __acquires(kernel_lock) return -1; } - if (strlen(type->name) > MAX_TRACER_SIZE) { + 
if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } - /* - * When this gets called we hold the BKL which means that - * preemption is disabled. Various trace selftests however - * need to disable and enable preemption for successful tests. - * So we drop the BKL here and grab it after the tests again. - */ - unlock_kernel(); mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -717,7 +822,6 @@ __acquires(kernel_lock) #endif out_unlock: - lock_kernel(); return ret; } @@ -747,10 +851,10 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct ring_buffer *buffer, int cpu) { ftrace_disable_cpu(); - ring_buffer_reset_cpu(tr->buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ftrace_enable_cpu(); } @@ -762,7 +866,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -780,7 +884,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -857,6 +961,8 @@ void tracing_start(void) goto out; } + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); buffer = global_trace.buffer; if (buffer) @@ -866,6 +972,8 @@ void tracing_start(void) if (buffer) ring_buffer_record_enable(buffer); + arch_spin_unlock(&ftrace_max_lock); + ftrace_start(); out: spin_unlock_irqrestore(&tracing_start_lock, flags); @@ -887,6 +995,9 @@ void tracing_stop(void) if (trace_stop_count++) goto out; + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -895,6 +1006,8 @@ void tracing_stop(void) if (buffer) ring_buffer_record_disable(buffer); + arch_spin_unlock(&ftrace_max_lock); + out: spin_unlock_irqrestore(&tracing_start_lock, flags); } @@ -951,6 +1064,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; @@ -1084,7 +1202,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), @@ -1177,6 +1295,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. 
+ */ + if (unlikely(in_nmi())) + return; + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1205,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ftrace_event_call *call = &event_special; - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct ring_buffer *buffer = tr->buffer; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - /** * trace_vbprintk - write binary msg to tracing buffer * @@ -1278,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct bprint_entry *entry; unsigned long flags; int disable; - int resched; int cpu, len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1288,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1315,8 +1384,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1324,7 +1395,7 @@ out_unlock: out: atomic_dec_return(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -1389,8 +1460,10 @@ int trace_array_vprintk(struct trace_array *tr, memcpy(&entry->buf, trace_buf, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1409,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; - static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ @@ -1427,7 +1495,8 @@ static void 
trace_iterator_increment(struct trace_iterator *iter) } static struct trace_entry * -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; @@ -1438,7 +1507,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + lost_events); ftrace_enable_cpu(); @@ -1446,10 +1516,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) } static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; @@ -1462,7 +1534,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (cpu_file > TRACE_PIPE_ALL_CPU) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; - ent = peek_next_entry(iter, cpu_file, ent_ts); + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; @@ -1474,7 +1546,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = peek_next_entry(iter, cpu, &ts); + ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: @@ -1483,6 +1555,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) next = ent; next_cpu = cpu; next_ts = ts; + next_lost = lost_events; } } @@ -1492,6 +1565,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ent_ts) *ent_ts = next_ts; + if (missing_events) + *missing_events = next_lost; + return next; } @@ -1499,13 +1575,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, ent_ts); + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); } /* Find the next real entry, and increment the iterator to the next entry */ -static void *find_next_entry_inc(struct trace_iterator *iter) +void *trace_find_next_entry_inc(struct trace_iterator *iter) { - iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); @@ -1517,7 +1594,8 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + &iter->lost_events); ftrace_enable_cpu(); } @@ -1536,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return NULL; if (iter->idx < 0) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } -static void tracing_iter_reset(struct 
trace_iterator *iter, int cpu) +void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct trace_array *tr = iter->tr; struct ring_buffer_event *event; @@ -1580,12 +1658,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) } /* - * No necessary locking here. The worst thing which can - * happen is loosing events consumed at the same time - * by a trace_pipe reader. - * Other than that, we don't risk to crash the ring buffer - * because it serializes the readers. - * * The current tracer is copied to avoid a global locking * all around. */ @@ -1623,6 +1695,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_enable_cpu(); + iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -1640,12 +1713,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) } trace_event_read_lock(); + trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { + struct trace_iterator *iter = m->private; + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -1669,7 +1746,7 @@ static void print_func_help_header(struct seq_file *m) } -static void +void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1797,7 +1874,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } if (event) - return event->trace(iter, sym_flags); + return event->funcs->trace(iter, sym_flags, event); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) goto partial; @@ -1823,7 +1900,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) - return event->raw(iter, 0); + return event->funcs->raw(iter, 0, event); if (!trace_seq_printf(s, "%d ?\n", entry->type)) goto partial; @@ -1850,7 +1927,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) { - enum print_line_t ret = event->hex(iter, 0); + enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -1875,10 +1952,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; } -static int trace_empty(struct trace_iterator *iter) +int trace_empty(struct trace_iterator *iter) { int cpu; @@ -1909,10 +1987,14 @@ static int trace_empty(struct trace_iterator *iter) } /* Called with trace_event_read_lock() held. 
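The lost_events out-parameter threaded through peek_next_entry() and ring_buffer_consume() above lets a reader report drops. A hedged sketch of a consumer using the extended ring_buffer_consume(); example_drain_cpu is hypothetical, the ring_buffer_consume() signature is the one introduced here:

	#include <linux/kernel.h>
	#include <linux/ring_buffer.h>

	/* Sketch: drain one CPU's buffer and report how many events were dropped. */
	static void example_drain_cpu(struct ring_buffer *buffer, int cpu)
	{
		struct ring_buffer_event *event;
		unsigned long lost = 0;
		u64 ts;

		while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
			if (lost)
				pr_info("cpu%d: [LOST %lu EVENTS]\n", cpu, lost);
			/* ... decode ring_buffer_event_data(event) ... */
		}
	}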
*/ -static enum print_line_t print_trace_line(struct trace_iterator *iter) +enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) @@ -1941,6 +2023,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -1953,17 +2052,9 @@ static int s_show(struct seq_file *m, void *v) } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); - else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return 0; - print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); - } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); - } + else + trace_default_header(m); + } else if (iter->leftover) { /* * If we filled the seq_file buffer earlier, we @@ -2049,15 +2140,20 @@ __tracing_open(struct inode *inode, struct file *file) if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { - iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } @@ -2236,6 +2332,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, + .llseek = seq_lseek, }; /* @@ -2329,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) @@ -2404,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t @@ -2495,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2545,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2640,6 +2743,9 @@ 
static int tracing_resize_ring_buffer(unsigned long size) if (ret < 0) return ret; + if (!current_trace->use_max_tr) + goto out; + ret = ring_buffer_resize(max_tr.buffer, size); if (ret < 0) { int r; @@ -2667,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } + max_tr.entries = size; + out: global_trace.entries = size; return ret; } + /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @@ -2732,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } destroy_trace_option_files(topts); current_trace = t; topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } if (t->init) { ret = tracer_init(t, tr); @@ -2836,22 +2959,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - /* We only allow one reader per cpu */ - if (cpu_file == TRACE_PIPE_ALL_CPU) { - if (!cpumask_empty(tracing_reader_cpumask)) { - ret = -EBUSY; - goto out; - } - cpumask_setall(tracing_reader_cpumask); - } else { - if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) - cpumask_set_cpu(cpu_file, tracing_reader_cpumask); - else { - ret = -EBUSY; - goto out; - } - } - /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { @@ -2890,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (iter->trace->pipe_open) iter->trace->pipe_open(iter); + nonseekable_open(inode, filp); out: mutex_unlock(&trace_types_lock); return ret; @@ -2907,12 +3015,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) - cpumask_clear(tracing_reader_cpumask); - else - cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); - - if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -3074,7 +3176,8 @@ waitagain: iter->pos = -1; trace_event_read_lock(); - while (find_next_entry_inc(iter) != NULL) { + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3090,6 +3193,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ @@ -3156,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; - if (!find_next_entry_inc(iter)) { + if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; @@ -3172,12 +3276,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t len, unsigned int flags) { - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + 
.partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .flags = flags, .ops = &tracing_pipe_buf_ops, @@ -3188,6 +3292,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t rem; unsigned int i; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); if (unlikely(old_tracer != current_trace && current_trace)) { @@ -3209,46 +3316,50 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (ret <= 0) goto out_err; - if (!iter->ent && !find_next_entry_inc(iter)) { + if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } trace_event_read_lock(); + trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, - page_address(pages[i]), + page_address(spd.pages[i]), iter->seq.len); if (ret < 0) { - __free_page(pages[i]); + __free_page(spd.pages[i]); break; } - partial[i].offset = 0; - partial[i].len = iter->seq.len; + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; trace_seq_init(&iter->seq); } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; - return splice_to_pipe(pipe, &spd); + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; out_err: mutex_unlock(&iter->mutex); - - return ret; + goto out; } static ssize_t @@ -3332,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } tracing_start(); - max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); return cnt; @@ -3353,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; + size_t written; if (tracing_disabled) return -EINVAL; @@ -3374,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else buf[cnt] = '\0'; - cnt = mark_printk("%s", buf); + written = mark_printk("%s", buf); kfree(buf); - *fpos += cnt; + *fpos += written; - return cnt; + /* don't tell userspace we wrote more - it might confuse them */ + if (written > cnt) + written = cnt; + + return written; } static int tracing_clock_show(struct seq_file *m, void *v) @@ -3445,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, + .llseek = generic_file_llseek, }; static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_pipe_fops = { @@ -3465,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, + .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, 
.write = tracing_entries_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, + .llseek = generic_file_llseek, }; static const struct file_operations trace_clock_fops = { @@ -3521,7 +3642,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; - unsigned int pos; ssize_t ret; size_t size; @@ -3539,18 +3659,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; + trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, count, info->cpu, 0); + trace_access_unlock(info->cpu); if (ret < 0) return 0; - pos = ring_buffer_page_len(info->spare); - - if (pos < PAGE_SIZE) - memset(info->spare + pos, 0, PAGE_SIZE - pos); - read: size = PAGE_SIZE - info->read; if (size > count) @@ -3645,11 +3762,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -3658,21 +3775,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; size_t ret; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + if (*ppos & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (len & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) - return -EINVAL; + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } len &= PAGE_MASK; } + trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3717,6 +3841,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } + trace_access_unlock(info->cpu); spd.nr_pages = i; /* did we read anything? 
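The tracing_mark_write() change a little above now reports back at most the byte count the caller asked for. A small, runnable user-space sketch of using that file; the debugfs mount point is assumed:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char msg[] = "hello from user space\n";
		int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

		if (fd < 0) {
			perror("open trace_marker");
			return 1;
		}
		/* The kernel never claims to have written more than was requested. */
		if (write(fd, msg, strlen(msg)) < 0)
			perror("write trace_marker");
		close(fd);
		return 0;
	}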
*/ @@ -3726,11 +3851,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, else ret = 0; /* TODO: block */ - return ret; + goto out; } ret = splice_to_pipe(pipe, &spd); - + splice_shrink_spd(pipe, &spd); +out: return ret; } @@ -3776,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic, .read = tracing_stats_read, + .llseek = generic_file_llseek, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -3812,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, }; #endif @@ -3965,6 +4093,7 @@ static const struct file_operations trace_options_fops = { .open = tracing_open_generic, .read = trace_options_read, .write = trace_options_write, + .llseek = generic_file_llseek, }; static ssize_t @@ -4016,6 +4145,7 @@ static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, + .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, @@ -4153,6 +4283,8 @@ static __init int tracer_init_debugfs(void) struct dentry *d_tracer; int cpu; + trace_access_lock_init(); + d_tracer = tracing_init_dentry(); trace_create_file("tracing_enabled", 0644, d_tracer, @@ -4176,10 +4308,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); +#endif trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); -#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); @@ -4203,9 +4335,6 @@ static __init int tracer_init_debugfs(void) trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_SYSPROF_TRACER - init_tracer_sysprof_debugfs(d_tracer); -#endif create_trace_options_dir(); @@ -4219,7 +4348,7 @@ static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); return NOTIFY_OK; } @@ -4236,7 +4365,7 @@ static int trace_die_handler(struct notifier_block *self, switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); break; default: break; @@ -4262,7 +4391,7 @@ static struct notifier_block trace_die_notifier = { */ #define KERN_TRACE KERN_EMERG -static void +void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. 
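Both splice paths above now bracket their work with splice_grow_spd()/splice_shrink_spd() around a pipe whose buffer count is no longer fixed at PIPE_BUFFERS. A trimmed sketch of that bracketing pattern; example_splice_read is hypothetical, and the page filling, .ops and .spd_release setup are elided:

	#include <linux/fs.h>
	#include <linux/splice.h>
	#include <linux/pipe_fs_i.h>

	/* Sketch of the grow/splice/shrink pattern used by the tracing splice paths. */
	static ssize_t example_splice_read(struct file *filp, loff_t *ppos,
					   struct pipe_inode_info *pipe, size_t len,
					   unsigned int flags)
	{
		struct page *pages_def[PIPE_DEF_BUFFERS];
		struct partial_page partial_def[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages		= pages_def,
			.partial	= partial_def,
			.flags		= flags,
			/* .ops and .spd_release omitted in this sketch */
		};
		ssize_t ret;

		if (splice_grow_spd(pipe, &spd))	/* may allocate more than PIPE_DEF_BUFFERS */
			return -ENOMEM;

		/* ... fill up to pipe->buffers entries into spd.pages[]/spd.partial[] ... */

		ret = splice_to_pipe(pipe, &spd);
		splice_shrink_spd(pipe, &spd);		/* release whatever grow allocated */
		return ret;
	}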
*/ @@ -4277,7 +4406,15 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -static void __ftrace_dump(bool disable_tracing) +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { static arch_spinlock_t ftrace_dump_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -4301,8 +4438,10 @@ static void __ftrace_dump(bool disable_tracing) if (disable_tracing) ftrace_kill(); + trace_init_global_iter(&iter); + for_each_tracing_cpu(cpu) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&iter.tr->data[cpu]->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4310,12 +4449,25 @@ static void __ftrace_dump(bool disable_tracing) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - printk(KERN_TRACE "Dumping ftrace buffer:\n"); - /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; - iter.cpu_file = TRACE_PIPE_ALL_CPU; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = TRACE_PIPE_ALL_CPU; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = TRACE_PIPE_ALL_CPU; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); /* * We need to stop all tracing on all CPUS to read the @@ -4338,7 +4490,7 @@ static void __ftrace_dump(bool disable_tracing) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (find_next_entry_inc(&iter) != NULL) { + if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); @@ -4354,12 +4506,13 @@ static void __ftrace_dump(bool disable_tracing) else printk(KERN_TRACE "---------------------------------\n"); + out_enable: /* Re-enable tracing if requested */ if (!disable_tracing) { trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&iter.tr->data[cpu]->disabled); } tracing_on(); } @@ -4370,9 +4523,9 @@ static void __ftrace_dump(bool disable_tracing) } /* By default: disable tracing after the dump */ -void ftrace_dump(void) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - __ftrace_dump(true); + __ftrace_dump(true, oops_dump_mode); } __init static int tracer_alloc_buffers(void) @@ -4387,9 +4540,6 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) - goto out_free_tracing_cpumask; - /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4411,16 +4561,14 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = ring_buffer_size(max_tr.buffer); - WARN_ON(max_tr.entries != global_trace.entries); + max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ @@ -4433,9 +4581,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); current_trace 
= &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -4447,8 +4592,6 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_cpumask_var(tracing_reader_cpumask); -out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb19..d39b3c5454a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,10 +9,7 @@ #include <linux/mmiotrace.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> -#include <trace/boot.h> -#include <linux/kmemtrace.h> #include <linux/hw_breakpoint.h> - #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -25,31 +22,17 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, - TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -extern struct tracer boot_tracer; #undef __field #define __field(type, item) type item; @@ -103,29 +86,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. 
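ftrace_dump() now takes an explicit dump mode instead of always walking every CPU, as the __ftrace_dump() switch above shows. A hedged sketch of a caller choosing the narrower mode; example_oops_hook is invented and the enum is assumed to come in via linux/kernel.h:

	#include <linux/kernel.h>

	/* Sketch: dump only the buffer of the CPU that hit the problem. */
	static void example_oops_hook(void)
	{
		/* DUMP_ALL walks every per-cpu buffer; DUMP_ORIG stays on this CPU. */
		ftrace_dump(DUMP_ORIG);
	}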
These are: @@ -217,24 +188,15 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -312,6 +274,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + int use_max_tr; }; @@ -332,7 +295,6 @@ struct dentry *trace_create_file(const char *name, const struct file_operations *fops); struct dentry *tracing_init_dentry(void); -void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; @@ -352,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); @@ -369,15 +339,13 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +int trace_empty(struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); @@ -391,14 +359,22 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; -extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); +extern unsigned long tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; -extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, @@ -415,12 +391,12 @@ void 
ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_stack(struct trace_array *tr, +static inline void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { } -static inline void ftrace_trace_userstack(struct trace_array *tr, +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { } @@ -462,14 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sysprof(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); -extern int trace_selftest_startup_ksym(struct tracer *trace, - struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -483,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; @@ -490,13 +462,34 @@ extern int trace_clock_id; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 + +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern enum print_line_t trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); +extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, int pc); +extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, int pc); + #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; @@ -504,7 +497,7 @@ static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count || test_tsk_trace_graph(current)) + if (!ftrace_graph_filter_enabled) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -522,7 +515,7 @@ static inline int ftrace_graph_addr(unsigned long addr) #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } @@ -549,7 +542,7 @@ static inline int ftrace_trace_task(struct task_struct *task) * struct trace_parser - 
servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input - * @idx: user input lenght + * @idx: user input length * @size: buffer size */ struct trace_parser { @@ -608,6 +601,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x20000, TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -619,54 +613,6 @@ enum trace_iterator_flags { extern struct tracer nop_trace; -/** - * ftrace_preempt_disable - disable preemption scheduler safe - * - * When tracing can happen inside the scheduler, there exists - * cases that the tracing might happen before the need_resched - * flag is checked. If this happens and the tracer calls - * preempt_enable (after a disable), a schedule might take place - * causing an infinite recursion. - * - * To prevent this, we read the need_resched flag before - * disabling preemption. When we want to enable preemption we - * check the flag, if it is set, then we call preempt_enable_no_resched. - * Otherwise, we call preempt_enable. - * - * The rational for doing the above is that if need_resched is set - * and we have yet to reschedule, we are either in an atomic location - * (where we do not need to check for scheduling) or we are inside - * the scheduler and do not want to resched. - */ -static inline int ftrace_preempt_disable(void) -{ - int resched; - - resched = need_resched(); - preempt_disable_notrace(); - - return resched; -} - -/** - * ftrace_preempt_enable - enable preemption scheduler safe - * @resched: the return value from ftrace_preempt_disable - * - * This is a scheduler safe way to enable preemption and not miss - * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we are either inside an atomic or - * are inside the scheduler (we would have already scheduled - * otherwise). In this case, we do not want to call normal - * preempt_enable, but preempt_enable_no_resched instead. 
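The graph output helpers are now parameterized by a flags word, so each tracer can pick its own columns. A sketch of how a tracer might wire that up, using only the flag bits and prototypes declared above; every example_ name is invented:

	#include <linux/seq_file.h>
	#include "trace.h"

	/* Sketch: select graph output columns via the new flag bits. */
	#define EXAMPLE_GRAPH_FLAGS	(TRACE_GRAPH_PRINT_CPU |	\
					 TRACE_GRAPH_PRINT_DURATION |	\
					 TRACE_GRAPH_PRINT_OVERHEAD)

	static enum print_line_t example_print_line(struct trace_iterator *iter)
	{
		return print_graph_function_flags(iter, EXAMPLE_GRAPH_FLAGS);
	}

	static void example_print_header(struct seq_file *s)
	{
		print_graph_headers_flags(s, EXAMPLE_GRAPH_FLAGS);
	}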
- */ -static inline void ftrace_preempt_enable(int resched) -{ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); @@ -757,6 +703,8 @@ struct filter_pred { int pop_n; }; +extern struct list_head ftrace_common_fields; + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, @@ -769,12 +717,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call); + static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; @@ -783,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +extern void trace_event_enable_cmd_record(bool enable); + extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -791,7 +744,8 @@ extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ - extern struct ftrace_event_call event_##call; + extern struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956a..00000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/kallsyms.h> -#include <linux/time.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. 
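With the ftrace_preempt_disable()/ftrace_preempt_enable() helpers removed here, callers use the plain notrace primitives directly, as the trace_vbprintk() and trace_clock_local() hunks in this patch do. A minimal sketch of the replacement pattern; example_fast_path is invented:

	#include <linux/preempt.h>

	/* Sketch: keep a tracing fast path on one CPU without recursing into the tracer. */
	static void example_fast_path(void)
	{
		preempt_disable_notrace();
		/* ... per-cpu work that must not migrate or be traced ... */
		preempt_enable_notrace();
	}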
- */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - boot_trace = tr; - - if (!tr) - return 0; - - tracing_reset_online_cpus(tr); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_call; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. 
- */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_ret; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88..8d3538b4ea5 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr) } static enum print_line_t trace_branch_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct trace_branch *field; @@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s) " |\n"); } +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, - .trace = trace_branch_print, + .funcs = &trace_branch_funcs, }; static struct tracer branch_trace __read_mostly = @@ -307,8 +311,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) return -1; if (percent_a > percent_b) return 1; - else - return 0; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. + */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; } static struct tracer_stat annotated_branch_stats = { diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 84a3a7ba072..685a67d55db 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -13,6 +13,7 @@ * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> +#include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> @@ -31,16 +32,15 @@ u64 notrace trace_clock_local(void) { u64 clock; - int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. 
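The trace_branch change above illustrates the new split: the output callbacks move into a struct trace_event_functions that the struct trace_event points at, and each callback now receives the trace_event as a third argument. A hedged sketch of registering such an event; all example_ names are invented:

	#include "trace.h"

	static enum print_line_t example_trace_print(struct trace_iterator *iter,
						     int flags, struct trace_event *event)
	{
		/* ... format iter->ent into iter->seq ... */
		return TRACE_TYPE_HANDLED;
	}

	static struct trace_event_functions example_funcs = {
		.trace	= example_trace_print,
	};

	static struct trace_event example_event = {
		.type	= TRACE_BRANCH,
		.funcs	= &example_funcs,
	};

	/* Registration would then be: register_ftrace_event(&example_event); */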
*/ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); clock = sched_clock(); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return clock; } @@ -55,7 +55,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } @@ -83,7 +83,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = cpu_clock(this_cpu); @@ -109,7 +109,7 @@ u64 notrace trace_clock_global(void) arch_spin_unlock(&trace_clock_struct.lock); out: - raw_local_irq_restore(flags); + local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c16a08f399d..e3dfecaf13e 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ); /* - * Special (free-form) trace entry: - */ -FTRACE_ENTRY(special, special_entry, - - TRACE_SPECIAL, - - F_STRUCT( - __field( unsigned long, arg1 ) - __field( unsigned long, arg2 ) - __field( unsigned long, arg3 ) - ), - - F_printk("(%08lx) (%08lx) (%08lx)", - __entry->arg1, __entry->arg2, __entry->arg3) -); - -/* * Stack-trace entry: */ @@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __entry->map_id, __entry->opcode) ); -FTRACE_ENTRY(boot_call, trace_boot_call, - - TRACE_BOOT_CALL, - - F_STRUCT( - __field_struct( struct boot_trace_call, boot_call ) - __field_desc( pid_t, boot_call, caller ) - __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) - ), - - F_printk("%d %s", __entry->caller, __entry->func) -); - -FTRACE_ENTRY(boot_ret, trace_boot_ret, - - TRACE_BOOT_RET, - - F_STRUCT( - __field_struct( struct boot_trace_ret, boot_ret ) - __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) - __field_desc( int, boot_ret, result ) - __field_desc( unsigned long, boot_ret, duration ) - ), - - F_printk("%s %d %lx", - __entry->func, __entry->result, __entry->duration) -); #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 @@ -318,65 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(hw_branch, hw_branch_entry, - - TRACE_HW_BRANCHES, - - F_STRUCT( - __field( u64, from ) - __field( u64, to ) - ), - - F_printk("from: %llx to: %llx", __entry->from, __entry->to) -); - -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - -FTRACE_ENTRY(ksym_trace, ksym_trace_entry, - - TRACE_KSYM, - - F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned char, type ) - __array( char , cmd, TASK_COMM_LEN ) - __field( unsigned long, addr ) - ), - - F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", - (void 
*)__entry->ip, (unsigned int)__entry->type, - (void *)__entry->addr, __entry->cmd) -); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c new file mode 100644 index 00000000000..31cc4cb0dbf --- /dev/null +++ b/kernel/trace/trace_event_perf.c @@ -0,0 +1,183 @@ +/* + * trace event based perf event profiling/tracing + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> + */ + +#include <linux/module.h> +#include <linux/kprobes.h> +#include "trace.h" + +static char *perf_trace_buf[4]; + +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; + +/* Count the events in use (per event id, not per instance) */ +static int total_ref_count; + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + struct hlist_head *list; + int ret = -ENOMEM; + int cpu; + + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) + return 0; + + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; + + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); + + tp_event->perf_events = list; + + if (!total_ref_count) { + char *buf; + int i; + + for (i = 0; i < 4; i++) { + buf = (char *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } + } + + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: + if (!total_ref_count) { + int i; + + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + } + + return ret; +} + +int perf_trace_init(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event; + int event_id = p_event->attr.config; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && tp_event->class->reg && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +int perf_trace_enable(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head *list; + + list = tp_event->perf_events; + if (WARN_ON_ONCE(!list)) + return -EINVAL; + + list = this_cpu_ptr(list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + + return 0; +} + +void perf_trace_disable(struct perf_event *p_event) +{ + hlist_del_rcu(&p_event->hlist_entry); +} + +void perf_trace_destroy(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + mutex_lock(&event_mutex); + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. 
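perf_trace_event_init() above allocates one hlist head per CPU the first time an event is bound to perf. The allocate-then-initialize idiom, pulled out as a standalone hedged sketch; example_alloc_event_heads is invented:

	#include <linux/percpu.h>
	#include <linux/cpumask.h>
	#include <linux/list.h>

	/* Sketch: one RCU hlist head per possible CPU, initialized up front. */
	static struct hlist_head *example_alloc_event_heads(void)
	{
		struct hlist_head *list;
		int cpu;

		list = alloc_percpu(struct hlist_head);
		if (!list)
			return NULL;

		for_each_possible_cpu(cpu)
			INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

		return list;
	}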
+ */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); + mutex_unlock(&event_mutex); +} + +__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) +{ + struct trace_entry *entry; + unsigned long flags; + char *raw_data; + int pc; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + + pc = preempt_count(); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + return NULL; + + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + + /* zero the dead bytes from align to not leak stack to user */ + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + + entry = (struct trace_entry *)raw_data; + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); + entry->type = type; + + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c deleted file mode 100644 index 9e25573242c..00000000000 --- a/kernel/trace/trace_event_profile.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * trace event based perf counter profiling - * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> - * - */ - -#include <linux/module.h> -#include "trace.h" - - -char *perf_trace_buf; -EXPORT_SYMBOL_GPL(perf_trace_buf); - -char *perf_trace_buf_nmi; -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); - -typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; - -/* Count the events in use (per event id, not per instance) */ -static int total_profile_count; - -static int ftrace_profile_enable_event(struct ftrace_event_call *event) -{ - char *buf; - int ret = -ENOMEM; - - if (event->profile_count++ > 0) - return 0; - - if (!total_profile_count) { - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf; - - rcu_assign_pointer(perf_trace_buf, buf); - - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf_nmi; - - rcu_assign_pointer(perf_trace_buf_nmi, buf); - } - - ret = event->profile_enable(event); - if (!ret) { - total_profile_count++; - return 0; - } - -fail_buf_nmi: - if (!total_profile_count) { - free_percpu(perf_trace_buf_nmi); - free_percpu(perf_trace_buf); - perf_trace_buf_nmi = NULL; - perf_trace_buf = NULL; - } -fail_buf: - event->profile_count--; - - return ret; -} - -int ftrace_profile_enable(int event_id) -{ - struct ftrace_event_call *event; - int ret = -EINVAL; - - mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable && - try_module_get(event->mod)) { - ret = ftrace_profile_enable_event(event); - break; - } - } - mutex_unlock(&event_mutex); - - return ret; -} - -static void ftrace_profile_disable_event(struct ftrace_event_call *event) -{ - char *buf, *nmi_buf; - - if (--event->profile_count > 0) - return; - - event->profile_disable(event); - - if (!--total_profile_count) { - buf = perf_trace_buf; - rcu_assign_pointer(perf_trace_buf, NULL); - - nmi_buf = perf_trace_buf_nmi; - rcu_assign_pointer(perf_trace_buf_nmi, NULL); - - /* - * Ensure every events in profiling have finished before - * releasing the buffers - */ - synchronize_sched(); - - free_percpu(buf); - free_percpu(nmi_buf); - } -} - -void ftrace_profile_disable(int event_id) -{ - struct ftrace_event_call *event; - - 
mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - ftrace_profile_disable_event(event); - module_put(event->mod); - break; - } - } - mutex_unlock(&event_mutex); -} diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4f..4c758f14632 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,6 +15,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <linux/delay.h> #include <asm/setup.h> @@ -27,10 +28,19 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); -int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed, - int filter_type) +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) { struct ftrace_event_field *field; @@ -55,30 +65,43 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - list_add(&field->link, &call->fields); + list_add(&field->link, head); return 0; err: - if (field) { + if (field) kfree(field->name); - kfree(field->type); - } kfree(field); return -ENOMEM; } + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} EXPORT_SYMBOL_GPL(trace_define_field); #define __common_field(type, item) \ - ret = trace_define_field(call, #type, "common_" #item, \ - offsetof(typeof(ent), item), \ - sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -static int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(void) { int ret; struct trace_entry ent; @@ -95,8 +118,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call) void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; + struct list_head *head; - list_for_each_entry_safe(field, next, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kfree(field->type); kfree(field->name); @@ -108,16 +133,63 @@ int trace_event_raw_init(struct ftrace_event_call *call) { int id; - id = register_ftrace_event(call->event); + id = register_ftrace_event(&call->event); if (!id) return -ENODEV; - call->id = id; - INIT_LIST_HEAD(&call->fields); return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + 
+#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -125,23 +197,29 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, switch (enable) { case 0: - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(call); + if (call->flags & TRACE_EVENT_FL_ENABLED) { + call->flags &= ~TRACE_EVENT_FL_ENABLED; + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: - if (!call->enabled) { - tracing_start_cmdline_record(); - ret = call->regfunc(call); + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } + ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->enabled = 1; + call->flags |= TRACE_EVENT_FL_ENABLED; } break; } @@ -172,15 +250,15 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || !call->class->reg) continue; if (match && strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) + strcmp(match, call->class->system) != 0) continue; - if (sub && strcmp(sub, call->system) != 0) + if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, call->name) != 0) @@ -298,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. 
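The enable/disable path above no longer keeps a separate enabled integer; event state now lives in a single flags word that is tested with &, set with |=, and cleared with &= ~. A stand-alone sketch of that state handling, with illustrative flag names and a toy_event struct that are not the kernel's:

    #include <stdio.h>

    #define FL_ENABLED       (1 << 0)   /* event is currently enabled */
    #define FL_RECORDED_CMD  (1 << 1)   /* cmdline recording was started for it */

    struct toy_event { unsigned int flags; };

    static void toy_enable(struct toy_event *ev, int record_cmd)
    {
        if (ev->flags & FL_ENABLED)
            return;
        if (record_cmd)
            ev->flags |= FL_RECORDED_CMD;
        ev->flags |= FL_ENABLED;
    }

    static void toy_disable(struct toy_event *ev)
    {
        if (!(ev->flags & FL_ENABLED))
            return;
        if (ev->flags & FL_RECORDED_CMD)
            ev->flags &= ~FL_RECORDED_CMD;
        ev->flags &= ~FL_ENABLED;
    }

    int main(void)
    {
        struct toy_event ev = { 0 };
        toy_enable(&ev, 1);
        printf("flags after enable: 0x%x\n", ev.flags);
        toy_disable(&ev);
        printf("flags after disable: 0x%x\n", ev.flags);
        return 0;
    }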
*/ - if (call->regfunc) + if (call->class && call->class->reg) return call; } @@ -329,7 +407,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) return call; } @@ -356,8 +434,8 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_event_call *call = v; - if (strcmp(call->system, TRACE_SYSTEM) != 0) - seq_printf(m, "%s:", call->system); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", call->name); return 0; @@ -388,7 +466,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) buf = "1\n"; else buf = "0\n"; @@ -451,10 +529,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->system, system) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -462,7 +540,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!call->enabled); + set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -520,74 +598,165 @@ out: return ret; } -extern char *__bad_type_size(void); +enum { + FORMAT_HEADER = 1, + FORMAT_PRINTFMT = 2, +}; -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) +static void *f_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + struct list_head *head; + + (*pos)++; -static int trace_write_header(struct trace_seq *s) + switch ((unsigned long)v) { + case FORMAT_HEADER: + head = &ftrace_common_fields; + + if (unlikely(list_empty(head))) + return NULL; + + field = list_entry(head->prev, struct ftrace_event_field, link); + return field; + + case FORMAT_PRINTFMT: + /* all done */ + return NULL; + } + + head = trace_get_fields(call); + + /* + * To separate common fields from event fields, the + * LSB is set on the first event field. Clear it in case. + */ + v = (void *)((unsigned long)v & ~1L); + + field = v; + /* + * If this is a common field, and at the end of the list, then + * continue with main list. 
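The low-bit trick described in the comment above works because the field structures are at least pointer aligned, which leaves bit 0 of their address free to carry a "first event-specific field" marker that is cleared before the pointer is dereferenced. A minimal user-space sketch of the tagging, with an illustrative struct field type:

    #include <stdint.h>
    #include <stdio.h>

    struct field { const char *name; };

    /* Aligned structures never have bit 0 set, so it is free for a one-bit marker. */
    static void *tag_first(struct field *f)
    {
        return (void *)((uintptr_t)f | (uintptr_t)1);
    }

    static struct field *untag(void *v, int *was_first)
    {
        *was_first = (int)((uintptr_t)v & (uintptr_t)1);
        return (struct field *)((uintptr_t)v & ~(uintptr_t)1);
    }

    int main(void)
    {
        struct field f = { "common_pid" };
        int first;
        struct field *p = untag(tag_first(&f), &first);
        printf("%s (first=%d)\n", p->name, first);
        return 0;
    }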
+ */ + if (field->link.prev == &ftrace_common_fields) { + if (unlikely(list_empty(head))) + return NULL; + field = list_entry(head->prev, struct ftrace_event_field, link); + /* Set the LSB to notify f_show to print an extra newline */ + field = (struct ftrace_event_field *) + ((unsigned long)field | 1); + return field; + } + + /* If we are done tell f_show to print the format */ + if (field->link.prev == head) + return (void *)FORMAT_PRINTFMT; + + field = list_entry(field->link.prev, struct ftrace_event_field, link); + + return field; +} + +static void *f_start(struct seq_file *m, loff_t *pos) { - struct trace_entry field; - - /* struct trace_entry */ - return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); + loff_t l = 0; + void *p; + + /* Start by showing the header */ + if (!*pos) + return (void *)FORMAT_HEADER; + + p = (void *)FORMAT_HEADER; + do { + p = f_next(m, p, &l); + } while (p && l < *pos); + + return p; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - char *buf; - int r; + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + const char *array_descriptor; - if (*ppos) + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "ID: %d\n", call->event.type); + seq_printf(m, "format:\n"); return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; + } - trace_seq_init(s); + /* + * To separate common fields from event fields, the + * LSB is set on the first event field. Clear it and + * print a newline if it is set. + */ + if ((unsigned long)v & 1) { + seq_putc(m, '\n'); + v = (void *)((unsigned long)v & ~1L); + } + + field = v; - /* If any of the first writes fail, so will the show_format. */ + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->id); - trace_seq_printf(s, "format:\n"); - trace_write_header(s); + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; - r = call->show_format(call, s); - if (!r) { - /* - * ug! The format output is bigger than a PAGE!! 
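The removed read handler below had to render the whole format into one page-sized buffer and bail out with "FORMAT TOO BIG" when it did not fit; the seq_file based replacement instead emits one field per iteration step, so output length is no longer bounded by a single buffer. A user-space sketch of that start/next/show shape, with illustrative names and sample fields:

    #include <stdio.h>

    static const char *fields[] = { "common_type", "common_flags", "common_pid", "ip", "parent_ip" };
    #define NFIELDS (sizeof(fields) / sizeof(fields[0]))

    /* start: position the cursor; next: advance it; show: print one record. */
    static const char *fmt_start(long *pos) { return *pos < (long)NFIELDS ? fields[*pos] : NULL; }
    static const char *fmt_next(long *pos)  { ++*pos; return fmt_start(pos); }
    static void fmt_show(const char *name)  { printf("\tfield:%s;\n", name); }

    int main(void)
    {
        long pos = 0;
        for (const char *f = fmt_start(&pos); f; f = fmt_next(&pos))
            fmt_show(f);
        return 0;
    }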
- */ - buf = "FORMAT TOO BIG\n"; - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, strlen(buf)); - goto out; - } + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - out: - kfree(s); - return r; + return 0; +} + +static void f_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct ftrace_event_call *call = inode->i_private; + struct seq_file *m; + int ret; + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = call; + + return 0; } static ssize_t @@ -605,7 +774,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return -ENOMEM; trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->id); + trace_seq_printf(s, "%d\n", call->event.type); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); @@ -785,8 +954,10 @@ static const struct file_operations ftrace_enable_fops = { }; static const struct file_operations ftrace_event_format_fops = { - .open = tracing_open_generic, - .read = event_format_read, + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { @@ -911,14 +1082,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { + struct list_head *head; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->system, d_events); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + d_events = event_subsystem_dir(call->class->system, d_events); call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { @@ -927,30 +1099,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->regfunc) + if (call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id && call->profile_enable) +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg) trace_create_file("id", 0444, call->dir, call, id); +#endif - if (call->define_fields) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->define_fields(call); + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. 
+ */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); return ret; } - trace_create_file("filter", 0644, call->dir, call, - filter); } - - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; + trace_create_file("filter", 0644, call->dir, call, + filter); trace_create_file("format", 0444, call->dir, call, format); @@ -958,20 +1131,26 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } -static int __trace_add_event_call(struct ftrace_event_call *call) +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *d_events; int ret; + /* The linker may leave blanks */ if (!call->name) return -EINVAL; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "events/%s\n", call->name); + pr_warning("Could not initialize trace events/%s\n", + call->name); return ret; } } @@ -980,11 +1159,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!d_events) return -ENOENT; - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, id, enable, filter, format); if (!ret) list_add(&call->list, &ftrace_events); + call->mod = mod; return ret; } @@ -994,7 +1172,10 @@ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); mutex_unlock(&event_mutex); return ret; } @@ -1031,13 +1212,13 @@ static void remove_subsystem_dir(const char *name) static void __trace_remove_event_call(struct ftrace_event_call *call) { ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); - remove_subsystem_dir(call->system); + remove_subsystem_dir(call->class->system); } /* Remove an event_call */ @@ -1111,8 +1292,6 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; - struct dentry *d_events; - int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1120,38 +1299,14 @@ static void trace_module_add_events(struct module *mod) if (start == end) return; - d_events = event_trace_events_dir(); - if (!d_events) + file_ops = trace_create_file_ops(mod); + if (!file_ops) return; for_each_event(call, start, end) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->raw_init) { - ret = call->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - /* - * This module has events, create file ops for this module - * if not already done. 
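In the directory creation above, the fields are only defined while the class's field list is still empty, so several events backed by the same class populate it once. A small sketch of that guard, where event_class and define_fields are illustrative stand-ins:

    #include <stdio.h>

    struct event_class {
        int nr_fields;                            /* 0 means "not defined yet" */
        void (*define_fields)(struct event_class *c);
    };

    struct event { const char *name; struct event_class *class; };

    static void add_event(struct event *ev)
    {
        /* Only the first event of a class populates the field list. */
        if (ev->class->nr_fields == 0)
            ev->class->define_fields(ev->class);
        printf("added %s (class has %d fields)\n", ev->name, ev->class->nr_fields);
    }

    static void define_syscall_fields(struct event_class *c) { c->nr_fields = 3; }

    int main(void)
    {
        struct event_class syscalls = { 0, define_syscall_fields };
        struct event enter = { "sys_enter_read", &syscalls };
        struct event leave = { "sys_exit_read",  &syscalls };

        add_event(&enter);
        add_event(&leave);   /* reuses the fields defined above */
        return 0;
    }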
- */ - if (!file_ops) { - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - } - call->mod = mod; - ret = event_create_dir(call, d_events, + __trace_add_event_call(call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); - if (!ret) - list_add(&call->list, &ftrace_events); } } @@ -1278,25 +1433,14 @@ static __init int event_trace_init(void) trace_create_file("enable", 0644, d_events, NULL, &ftrace_system_enable_fops); + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->raw_init) { - ret = call->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + __trace_add_event_call(call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - if (!ret) - list_add(&call->list, &ftrace_events); } while (true) { @@ -1384,8 +1528,8 @@ static __init void event_trace_self_tests(void) list_for_each_entry(call, &ftrace_events, list) { - /* Only test those that have a regfunc */ - if (!call->regfunc) + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) continue; /* @@ -1395,8 +1539,8 @@ static __init void event_trace_self_tests(void) * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS - if (call->system && - strcmp(call->system, "syscalls") == 0) + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) continue; #endif @@ -1406,7 +1550,7 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. 
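Both the built-in loop and the module loop above now funnel through one helper that records the owning module (NULL for built-ins), which is what allowed the duplicated raw_init and directory-creation code to be dropped. A compact sketch of that single entry point, where register_call and the toy module struct are illustrative:

    #include <stdio.h>
    #include <stddef.h>

    struct module { const char *name; };   /* toy stand-in for the owner */
    struct call { const char *name; const struct module *owner; };

    static int register_call(struct call *c, const struct module *owner)
    {
        c->owner = owner;                   /* remember who provided the event */
        printf("registered %s (%s)\n", c->name,
               owner ? owner->name : "built-in");
        return 0;
    }

    int main(void)
    {
        const struct module my_mod = { "ext4" };
        struct call core = { "sched_wakeup", NULL };
        struct call mod  = { "ext4_da_write_begin", NULL };

        register_call(&core, NULL);         /* built-in registration path */
        register_call(&mod, &my_mod);       /* module registration path */
        return 0;
    }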
*/ - if (call->enabled) { + if (call->flags & TRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; @@ -1483,12 +1627,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) struct ftrace_entry *entry; unsigned long flags; long disabled; - int resched; int cpu; int pc; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1510,7 +1653,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e42af9aad69..36d40104b17 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,6 +22,7 @@ #include <linux/ctype.h> #include <linux/mutex.h> #include <linux/perf_event.h> +#include <linux/slab.h> #include "trace.h" #include "trace_output.h" @@ -496,11 +497,11 @@ void print_subsystem_event_filter(struct event_subsystem *system, } static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +__find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; - list_for_each_entry(field, &call->fields, link) { + list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } @@ -508,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static void filter_free_pred(struct filter_pred *pred) { if (!pred) @@ -544,7 +559,7 @@ static void filter_disable_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) @@ -571,7 +586,7 @@ void destroy_preds(struct ftrace_event_call *call) { __free_preds(call->filter); call->filter = NULL; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_preds(void) @@ -610,7 +625,7 @@ static int init_preds(struct ftrace_event_call *call) if (call->filter) return 0; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; call->filter = __alloc_preds(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -624,10 +639,7 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; err = init_preds(call); @@ -643,10 +655,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; filter_disable_preds(call); @@ -1248,10 +1257,7 @@ 
static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; /* try to see if the filter can be applied */ @@ -1265,7 +1271,7 @@ static int replace_system_preds(struct event_subsystem *system, if (err) filter_disable_preds(call); else { - call->filter_active = 1; + call->flags |= TRACE_EVENT_FL_FILTERED; replace_filter_string(filter, filter_string); } fail = false; @@ -1314,7 +1320,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (err) append_filter_err(ps, call->filter); else - call->filter_active = 1; + call->flags |= TRACE_EVENT_FL_FILTERED; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1371,7 +1377,7 @@ out_unlock: return err; } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS void ftrace_profile_free_filter(struct perf_event *event) { @@ -1392,12 +1398,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (call->id == event_id) + if (call->event.type == event_id) break; } err = -EINVAL; - if (!call) + if (&call->list == &ftrace_events) goto out_unlock; err = -EEXIST; @@ -1439,5 +1445,5 @@ out_unlock: return err; } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4..4ba44deaac2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) 
"%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; #undef __dynamic_array -#define __dynamic_array(type, item) +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), FILTER_OTHER);\ + if (ret) \ + return ret; #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ @@ -192,11 +125,8 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" -static int ftrace_raw_init_event(struct ftrace_event_call *call) -{ - INIT_LIST_HEAD(&call->fields); - return 0; -} +#undef __entry +#define __entry REC #undef __field #define __field(type, item) @@ -213,18 +143,25 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + \ +struct ftrace_event_class event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .define_fields = ftrace_define_fields_##call, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ +}; \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .id = type, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ + .event.type = etype, \ + .class = &event_class_ftrace_##call, \ + .print_fmt = print, \ }; \ #include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd..16aee4d44e8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu, resched; + int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37c..6f233698518 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include 
<linux/fs.h> #include "trace.h" @@ -18,6 +19,7 @@ struct fgraph_cpu_data { pid_t last_pid; int depth; int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { @@ -38,7 +40,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -177,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } -static int __trace_graph_entry(struct trace_array *tr, +int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (unlikely(!tr)) - return 0; - if (!ftrace_trace_task(current)) return 0; - if (!ftrace_graph_addr(trace->func)) + /* trace it when it is-nested-in or is a function enabled. */ + if (!(trace->depth || ftrace_graph_addr(trace->func))) return 0; local_irq_save(flags); @@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) } else { ret = 0; } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -241,7 +238,15 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static void __trace_graph_return(struct trace_array *tr, +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + +void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc) @@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, @@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace) pc = preempt_count(); __trace_graph_return(tr, trace, flags, pc); } - if (!trace->depth) - clear_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + static int graph_trace_init(struct trace_array *tr) { int ret; - graph_array = tr; - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + set_graph_array(tr); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -301,11 +326,6 
@@ static int graph_trace_init(struct trace_array *tr) return 0; } -void set_graph_array(struct trace_array *tr) -{ - graph_array = tr; -} - static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -470,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + ring_buffer_consume(iter->tr->buffer, iter->cpu, + NULL, NULL); event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); + NULL, NULL); } if (!event) @@ -486,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter, * if the output fails. */ data->ent = *curr; - data->ret = *next; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. + */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; } } @@ -506,17 +535,18 @@ get_return_for_leaf(struct trace_iterator *iter, /* Signal a overhead of time execution to the output */ static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +print_graph_overhead(unsigned long long duration, struct trace_seq *s, + u32 flags) { /* If duration disappear, we don't need anything */ - if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) return 1; /* Non nested entry or return */ if (duration == -1) return trace_seq_printf(s, " "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) return trace_seq_printf(s, "! "); @@ -542,7 +572,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s) static enum print_line_t print_graph_irq(struct trace_iterator *iter, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid, u32 flags) { int ret; struct trace_seq *s = &iter->seq; @@ -552,21 +582,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_UNHANDLED; /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -576,7 +606,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -589,7 +619,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_PARTIAL_LINE; /* Don't close the duration column if haven't one */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) trace_seq_printf(s, " |"); ret = trace_seq_printf(s, "\n"); @@ -619,7 +649,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + snprintf(nsecs_str, 
min(sizeof(nsecs_str), 8UL - len), "%03lu", + nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -659,7 +690,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; @@ -673,24 +705,30 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments * equal to this depth. */ - *depth = call->depth - 1; + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; } /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -713,7 +751,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, int cpu) + struct trace_seq *s, int cpu, u32 flags) { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; @@ -721,19 +759,24 @@ print_graph_entry_nested(struct trace_iterator *iter, int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); - *depth = call->depth; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -759,7 +802,7 @@ print_graph_entry_nested(struct trace_iterator *iter, static enum print_line_t print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, - int type, unsigned long addr) + int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; @@ -772,27 +815,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (type) { /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) 
return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -814,7 +857,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) + struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; @@ -822,14 +865,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu, flags); if (data) { /* @@ -848,37 +891,47 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; pid_t pid = ent->pid; int cpu = iter->cpu; + int func_match = 1; int ret; int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. This is the * return from a function, we now want the comments * to display at the same level of the bracket. */ - *depth = trace->depth - 1; + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } } - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -891,19 +944,32 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. 
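The matching added above remembers which function was entered at each depth and compares it again on return; when they differ the closing brace is annotated with the function name instead of being printed bare. A user-space sketch of that bookkeeping, with an illustrative MAX_DEPTH and handler names:

    #include <stdio.h>

    #define MAX_DEPTH 16

    static const char *enter_funcs[MAX_DEPTH];   /* who was entered at each depth */

    static void on_entry(int depth, const char *func)
    {
        if (depth < MAX_DEPTH)
            enter_funcs[depth] = func;
    }

    static void on_return(int depth, const char *func)
    {
        int match = depth < MAX_DEPTH && enter_funcs[depth] == func;

        if (depth < MAX_DEPTH)
            enter_funcs[depth] = NULL;

        if (match)
            printf("}\n");
        else
            printf("} /* %s */\n", func);    /* entry was lost, name the function */
    }

    int main(void)
    {
        const char *f = "vfs_read";

        on_entry(0, f);
        on_return(0, f);                     /* matched: plain brace */
        on_return(0, "do_sys_open");         /* unmatched: annotated brace */
        return 0;
    }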
+ */ + if (func_match) { + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overrun */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + if (flags & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -911,8 +977,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct trace_seq *s, struct trace_entry *ent, - struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; @@ -924,16 +990,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (data) depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -968,7 +1034,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (!event) return TRACE_TYPE_UNHANDLED; - ret = event->trace(iter, sym_flags); + ret = event->funcs->trace(iter, sym_flags, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -988,7 +1054,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1009,7 +1075,7 @@ print_graph_function(struct trace_iterator *iter) if (data && data->failed) { field = &data->ent; iter->cpu = data->cpu; - ret = print_graph_entry(field, s, iter); + ret = print_graph_entry(field, s, iter, flags); if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; ret = TRACE_TYPE_NO_CONSUME; @@ -1029,32 +1095,50 @@ print_graph_function(struct trace_iterator *iter) struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; - return print_graph_entry(&saved, s, iter); + return print_graph_entry(&saved, s, iter, flags); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter); + return print_graph_return(&field->ret, s, entry, iter, flags); } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + default: - return print_graph_comment(s, entry, iter); + return print_graph_comment(s, entry, iter, flags); } return TRACE_TYPE_HANDLED; } -static void print_lat_header(struct seq_file *s) +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return 
print_graph_function_flags(iter, tracer_flags.val); +} + +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return print_graph_function(iter); +} + +static void print_lat_header(struct seq_file *s, u32 flags) { static const char spaces[] = " " /* 16 spaces */ " " /* 4 spaces */ " "; /* 17 spaces */ int size = 0; - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) size += 17; seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); @@ -1065,43 +1149,48 @@ static void print_lat_header(struct seq_file *s) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -static void print_graph_headers(struct seq_file *s) +void print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; if (lat) - print_lat_header(s); + print_lat_header(s, flags); /* 1st line */ seq_printf(s, "#"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) seq_printf(s, " CPU"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) seq_printf(s, "|||||"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ seq_printf(s, "#"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) seq_printf(s, "|||||"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); } -static void graph_trace_open(struct trace_iterator *iter) +void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + +void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ struct fgraph_data *data; @@ -1136,7 +1225,7 @@ static void graph_trace_open(struct trace_iterator *iter) pr_warning("function graph tracer: not enough memory\n"); } -static void graph_trace_close(struct trace_iterator *iter) +void graph_trace_close(struct trace_iterator *iter) { struct fgraph_data *data = iter->private; @@ -1146,6 +1235,20 @@ static void graph_trace_close(struct trace_iterator *iter) } } +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &graph_functions, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &graph_functions +}; + static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, @@ -1167,6 +1270,16 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + if (!register_ftrace_event(&graph_trace_entry_event)) { + pr_warning("Warning: could not 
register graph trace events\n"); + return 1; + } + + if (!register_ftrace_event(&graph_trace_ret_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + return register_tracer(&graph_trace); } diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index 7b97000745f..00000000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/fs.h> - -#include <asm/ds.h> - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); - -#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(hwb_tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), - BTS_BUFFER_SIZE, NULL, (size_t)-1, - BTS_KERNEL); - - if (IS_ERR(per_cpu(hwb_tracer, cpu))) - per_cpu(hwb_tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(hwb_tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. 
*/ - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct ring_buffer *buf; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - buf = tr->buffer; - event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, buf, event)) - trace_buffer_unlock_commit(buf, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. 
- */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2974bc7538c..73a6b0601f2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -34,6 +34,9 @@ static int trace_type __read_mostly; static int save_lat_flag; +static void stop_irqsoff_tracer(struct trace_array *tr, int graph); +static int start_irqsoff_tracer(struct trace_array *tr, int graph); + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -55,6 +58,23 @@ irq_trace(void) # define irq_trace() (0) #endif +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + int cpu; + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_irqsoff_tracer(irqsoff_trace, !set); + + for_each_possible_cpu(cpu) + per_cpu(tracing_cpu, cpu) = 0; + + tracing_max_latency = 0; + tracing_reset_online_cpus(irqsoff_trace); + + return start_irqsoff_tracer(irqsoff_trace, set); +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return 0; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return 0; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else + ret = 0; + + atomic_dec(&data->disabled); + return ret; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned 
long flags; + long disabled; + int cpu; + int pc; + + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + + atomic_dec(&data->disabled); +} + +static void irqsoff_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); + +} + +static void irqsoff_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + u32 flags = GRAPH_TRACER_FLAGS; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, flags); + + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_print_header(struct seq_file *s) +{ + if (is_graph()) { + struct trace_iterator *iter = s->private; + u32 flags = GRAPH_TRACER_FLAGS; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + print_graph_headers_flags(s, flags); + } else + trace_default_header(s); +} + +static void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (!is_graph()) + trace_function(tr, ip, parent_ip, flags, pc); + else { + trace_graph_function(tr, parent_ip, flags, pc); + trace_graph_function(tr, ip, flags, pc); + } +} + +#else +#define __trace_function trace_function + +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } +static void irqsoff_print_header(struct seq_file *s) { } +static void irqsoff_trace_open(struct trace_iterator *iter) { } +static void irqsoff_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? 
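[The display-graph tracer option added above lets the irqsoff latency tracer render its output through the function-graph printer instead of the plain TRACE_FN handler. A usage sketch only, assuming debugfs is mounted at /sys/kernel/debug and the kernel was built with CONFIG_FUNCTION_GRAPH_TRACER:

    echo irqsoff > /sys/kernel/debug/tracing/current_tracer
    echo 1 > /sys/kernel/debug/tracing/options/display-graph
    echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
    cat /sys/kernel/debug/tracing/trace

With the option cleared (the default), irqsoff_print_line() returns TRACE_TYPE_UNHANDLED and output falls back to the ordinary function-trace formatting.]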
*/ @@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); /* Skip 5 functions to get to the irq/preempt enable function */ __trace_stack(tr, flags, 5, pc); @@ -172,7 +388,7 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static void start_irqsoff_tracer(struct trace_array *tr) +static int start_irqsoff_tracer(struct trace_array *tr, int graph) { - register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) + int ret = 0; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&irqsoff_graph_return, + &irqsoff_graph_entry); + + if (!ret && tracing_is_enabled()) tracer_enabled = 1; else tracer_enabled = 0; + + return ret; } -static void stop_irqsoff_tracer(struct trace_array *tr) +static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr) /* make sure that the tracer is visible */ smp_wmb(); tracing_reset_online_cpus(tr); - start_irqsoff_tracer(tr); + + if (start_irqsoff_tracer(tr, is_graph())) + printk(KERN_ERR "failed to start irqsoff tracer\n"); } static void irqsoff_tracer_reset(struct trace_array *tr) { - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr, is_graph()); if (!save_lat_flag) trace_flags &= ~TRACE_ITER_LATENCY_FMT; @@ -409,9 +640,16 @@ static struct tracer irqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -435,9 +673,16 @@ static struct tracer preemptoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif + .open = irqsoff_trace_open, + .close = 
irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -463,9 +708,16 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 00000000000..7b8ecd751d9 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,136 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "../debug/kdb/kdb_private.h" +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp 
|| cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c9..544301d29de 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,10 @@ #include <linux/ctype.h> #include <linux/ptrace.h> #include <linux/perf_event.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> +#include <asm/bitsperlong.h> #include "trace.h" #include "trace_output.h" @@ -36,11 +40,11 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX #define KPROBE_EVENT_SYSTEM "kprobes" /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,60 +56,214 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); - void *data; -}; - -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, + void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, \ + void *data, void *ent)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +static inline void *get_rloc_data(u32 *dl) { - return f->func(regs, f->data); + return (u8 *)dl + get_rloc_offs(*dl); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) { - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); + return (u8 *)ent + get_rloc_offs(*dl); } -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); -} +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from 
data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) { - unsigned long retval; + int len = *(u32 *)data >> 16; - if (probe_kernel_address(addr, retval)) - return 0; - return retval; + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); } +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); -static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +struct fetch_param { + fetch_func_t fn; + void *data; +}; + +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); + return fprm->fn(regs, fprm->data, dest); } -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ +} +DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a 
null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { - return regs_return_value(regs); + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + if (!maxlen) + return; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); } - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { - return kernel_stack_pointer(regs); + int ret, len = 0; + u8 c; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; } /* Memory fetching by symbol */ @@ -150,51 +308,168 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == 
fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_END, +}; + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ + /* Fetch functions */ + fetch_func_t fetch[FETCH_MTD_END]; +} fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + return NULL; +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + struct fetch_param fetch_size; + unsigned int offset; /* Offset from 
argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -207,8 +482,9 @@ struct trace_probe { unsigned long nhit; unsigned int flags; /* For TP_FLAG_* */ const char *symbol; /* symbol name */ + struct ftrace_event_class class; struct ftrace_event_call call; - struct trace_event event; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -217,6 +493,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -227,51 +504,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_argument) - ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); - else if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -282,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Check the name is good for event/group */ -static int check_event_name(const char *name) +/* Check the name is good for event/group/fields */ +static int is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') return 0; @@ -325,22 +557,23 @@ static struct trace_probe *alloc_trace_probe(const char *group, else tp->rp.kp.pre_handler = kprobe_dispatcher; - if (!event || !check_event_name(event)) { + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } + tp->call.class = &tp->class; tp->call.name = kstrdup(event, GFP_KERNEL); if (!tp->call.name) goto error; - if (!group || !check_event_name(group)) { + if (!group || !is_good_name(group)) { ret = -EINVAL; goto error; } - tp->call.system = kstrdup(group, GFP_KERNEL); - if (!tp->call.system) + tp->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->class.system) goto error; INIT_LIST_HEAD(&tp->list); @@ -354,11 +587,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + 
free_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -368,7 +602,7 @@ static void free_trace_probe(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) free_probe_arg(&tp->args[i]); - kfree(tp->call.system); + kfree(tp->call.class->system); kfree(tp->call.name); kfree(tp->symbol); kfree(tp); @@ -381,7 +615,7 @@ static struct trace_probe *find_probe_event(const char *event, list_for_each_entry(tp, &probe_list, list) if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.system, group) == 0) + strcmp(tp->call.class->system, group) == 0) return tp; return NULL; } @@ -406,7 +640,7 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); /* register as an event */ - old_tp = find_probe_event(tp->call.name, tp->call.system); + old_tp = find_probe_event(tp->call.name, tp->call.class->system); if (old_tp) { /* delete old event */ unregister_trace_probe(old_tp); @@ -464,46 +698,41 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->fetch[FETCH_MTD_retval]; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->fetch[FETCH_MTD_stack]; + f->data = (void *)param; } } else ret = -EINVAL; - } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { - ret = strict_strtoul(arg + 3, 10, ¶m); - if (ret || param > PARAM_MAX_ARGS) - ret = -EINVAL; - else { - ff->func = fetch_argument; - ff->data = (void *)param; - } } else ret = -EINVAL; return ret; } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -512,13 +741,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->fetch[FETCH_MTD_reg]; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -527,26 +756,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->fetch[FETCH_MTD_memory]; + f->data = (void *)param; } else { ret = 
split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->fetch[FETCH_MTD_symbol]; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -556,38 +781,68 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->fetch[FETCH_MTD_deref]; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ + } + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", + t->name); ret = -EINVAL; } return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + int ret; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + return ret; } /* Return 1 if name is reserved or already used by another argument */ @@ -611,22 +866,24 @@ static int create_trace_probe(int argc, char **argv) * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] * Fetch args: - * $argN : fetch Nth of function argument. (N:0-) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. 
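[The fetch-argument syntax documented just above gains the FETCHARG:TYPE suffix in this patch, notably the new string type backed by the memory/deref string fetch functions. A usage sketch against the debugfs interface; the probe names and the %di/%si register mapping (x86-64 calling convention for do_sys_open) are illustrative assumptions, not part of the patch:

    echo 'p:myprobe do_sys_open dfd=%di filename=+0(%si):string' > /sys/kernel/debug/tracing/kprobe_events
    echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events
    echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
    cat /sys/kernel/debug/tracing/trace

Note the append ('>>') for the second definition: probes_open() runs cleanup_all_probes() when kprobe_events is opened with O_TRUNC, so re-opening it with '>' would drop the first probe.]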
*/ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -651,12 +908,12 @@ static int create_trace_probe(int argc, char **argv) event = strchr(group, '/') + 1; event[-1] = '\0'; if (strlen(group) == 0) { - pr_info("Group name is not specifiled\n"); + pr_info("Group name is not specified\n"); return -EINVAL; } } if (strlen(event) == 0) { - pr_info("Event name is not specifiled\n"); + pr_info("Event name is not specified\n"); return -EINVAL; } } @@ -668,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) pr_info("Delete command needs an event name.\n"); return -EINVAL; } + mutex_lock(&probe_lock); tp = find_probe_event(event, group); if (!tp) { + mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ unregister_trace_probe(tp); free_trace_probe(tp); + mutex_unlock(&probe_lock); return 0; } @@ -689,7 +949,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; @@ -732,37 +992,47 @@ static int create_trace_probe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Increment count for freeing args in error case */ + tp->nr_args++; + /* Parse argument name */ arg = strchr(argv[i], '='); - if (arg) + if (arg) { *arg++ = '\0'; - else + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + } else { arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + tp->args[i].name = kstrdup(buf, GFP_KERNEL); + } - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); + if (!tp->args[i].name) { + pr_info("Failed to allocate argument[%d] name.\n", i); + ret = -ENOMEM; + goto error; + } + + if (!is_good_name(tp->args[i].name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, tp->args[i].name); ret = -EINVAL; goto error; } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); - if (!tp->args[i].name) { - pr_info("Failed to allocate argument%d name '%s'.\n", - i, argv[i]); - ret = -ENOMEM; + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument[%d] name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; goto error; } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { - pr_info("Parse error at argument%d. (%d)\n", i, ret); - kfree(tp->args[i].name); + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; } - - tp->nr_args++; } ret = register_trace_probe(tp); @@ -810,11 +1080,10 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 
'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); if (!tp->symbol) seq_printf(m, " 0x%p", tp->rp.kp.addr); @@ -823,15 +1092,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -957,14 +1221,62 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Sum up total data length for dynamic arraies (strings) */ +static __kprobes int __get_data_size(struct trace_probe *tp, + struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, + struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + /* Kprobe handler */ -static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - int size, i, pc; + int size, dsize, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -973,72 +1285,67 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->id, size, - irq_flags, pc); + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; - for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); - return 0; } /* Kretprobe handler */ -static 
__kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - int size, i, pc; + int size, pc, dsize; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->id, size, - irq_flags, pc); + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); - - return 0; } /* Event entry printers */ enum print_line_t -print_kprobe_event(struct trace_iterator *iter, int flags) +print_kprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; - struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; - event = ftrace_find_event(field->ent.type); - tp = container_of(event, struct trace_probe, event); + field = (struct kprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) goto partial; @@ -1049,9 +1356,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1063,17 +1371,17 @@ partial: } enum print_line_t -print_kretprobe_event(struct trace_iterator *iter, int flags) +print_kretprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; - struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; - event = ftrace_find_event(field->ent.type); - tp = container_of(event, struct trace_probe, event); + field = (struct kretprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) goto partial; @@ -1090,9 +1398,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if 
(!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1127,13 +1436,6 @@ static void probe_event_disable(struct ftrace_event_call *call) } } -static int probe_event_raw_init(struct ftrace_event_call *event_call) -{ - INIT_LIST_HEAD(&event_call->fields); - - return 0; -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1148,242 +1450,172 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->fmttype, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->fmttype, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; + int pos = 0; - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; + const char *fmt, *arg; - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; - - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; + if (!probe_is_return(tp)) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? 
len - pos : 0) - return trace_seq_puts(s, "\n"); -} + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); +#undef LEN_OR_ZERO - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); + /* return the length of print_fmt */ + return pos; } -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) +static int set_print_fmt(struct trace_probe *tp) { - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; + int len; + char *print_fmt; - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1); + tp->call.print_fmt = print_fmt; - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); + return 0; } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes int kprobe_profile_func(struct kprobe *kp, +static __kprobes void kprobe_perf_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; - unsigned long irq_flags; - char *trace_buf; - char *raw_data; + struct kprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; int rctx; - pc = preempt_count(); - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return 0; - - /* - * Protect the 
non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); + return; - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + return; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; - for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ip, 1, entry, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - return 0; + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); } /* Kretprobe profile handler */ -static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; - unsigned long irq_flags; - char *trace_buf; - char *raw_data; + struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; int rctx; - pc = preempt_count(); - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return 0; - - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); + return; - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kretprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + return; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ret_ip, 1, entry, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - return 0; + 
head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } -static int probe_profile_enable(struct ftrace_event_call *call) +static int probe_perf_enable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1395,7 +1627,7 @@ static int probe_profile_enable(struct ftrace_event_call *call) return enable_kprobe(&tp->rp.kp); } -static void probe_profile_disable(struct ftrace_event_call *call) +static void probe_perf_disable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1408,8 +1640,28 @@ static void probe_profile_disable(struct ftrace_event_call *call) disable_kprobe(&tp->rp.kp); } } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ + +static __kprobes +int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(event); + case TRACE_REG_UNREGISTER: + probe_event_disable(event); + return 0; +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_perf_enable(event); + case TRACE_REG_PERF_UNREGISTER: + probe_perf_disable(event); + return 0; +#endif + } + return 0; +} static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) @@ -1418,10 +1670,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kprobe_trace_func(kp, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kprobe_profile_func(kp, regs); -#endif /* CONFIG_EVENT_PROFILE */ + kprobe_perf_func(kp, regs); +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1432,47 +1684,50 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kretprobe_trace_func(ri, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kretprobe_profile_func(ri, regs); -#endif /* CONFIG_EVENT_PROFILE */ + kretprobe_perf_func(ri, regs); +#endif return 0; /* We don't tweek kernel, so just return 0 */ } +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + static int register_probe_event(struct trace_probe *tp) { struct ftrace_event_call *call = &tp->call; int ret; /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); if (probe_is_return(tp)) { - tp->event.trace = print_kretprobe_event; - call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; - call->define_fields = kretprobe_event_define_fields; + call->event.funcs = &kretprobe_funcs; + call->class->define_fields = kretprobe_event_define_fields; } else { - tp->event.trace = print_kprobe_event; - call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; - call->define_fields = kprobe_event_define_fields; + call->event.funcs = &kprobe_funcs; + call->class->define_fields = kprobe_event_define_fields; } - call->event = &tp->event; - call->id = register_ftrace_event(&tp->event); - if (!call->id) + if (set_print_fmt(tp) < 0) + return -ENOMEM; + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); return -ENODEV; - call->enabled = 0; - call->regfunc = probe_event_enable; - call->unregfunc = probe_event_disable; - -#ifdef CONFIG_EVENT_PROFILE - call->profile_enable = probe_profile_enable; 
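[The per-call profile_enable/profile_disable hooks removed here are superseded by the single class->reg callback (kprobe_register above), which multiplexes TRACE_REG_REGISTER/UNREGISTER and, under CONFIG_PERF_EVENTS, TRACE_REG_PERF_REGISTER/UNREGISTER. As a rough usage sketch, reusing the hypothetical 'myprobe' event from the earlier example, the perf side would be exercised along the lines of:

    perf record -e kprobes:myprobe -a -- sleep 1
    perf report

which lands in kprobe_perf_func() via perf_trace_buf_prepare()/perf_trace_buf_submit() rather than in the ring-buffer path used by the ftrace trace file.]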
- call->profile_disable = probe_profile_disable; -#endif + } + call->flags = 0; + call->class->reg = kprobe_register; call->data = tp; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); - unregister_ftrace_event(&tp->event); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); } return ret; } @@ -1481,6 +1736,7 @@ static void unregister_probe_event(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); } /* Make a debugfs interface for controling probe points */ @@ -1523,28 +1779,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3, static __init int kprobe_trace_self_tests_init(void) { - int ret; + int ret, warn = 0; int (*target)(int, int, int, int, int, int); + struct trace_probe *tp; target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " - "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); - if (WARN_ON_ONCE(ret)) - pr_warning("error enabling function entry\n"); + "$stack $stack0 +0($stack)"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function entry.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " "$retval"); - if (WARN_ON_ONCE(ret)) - pr_warning("error enabling function return\n"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function return.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } + + if (warn) + goto end; ret = target(1, 2, 3, 4, 5, 6); - cleanup_all_probes(); + ret = command_trace_probe("-:testprobe"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } + + ret = command_trace_probe("-:testprobe2"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } - pr_cont("OK\n"); +end: + cleanup_all_probes(); + if (warn) + pr_cont("NG: Some tests are failed. Please check them.\n"); + else + pr_cont("OK\n"); return 0; } diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 94103cdcf9d..00000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null @@ -1,519 +0,0 @@ -/* - * trace_ksym.c - Kernel Symbol Tracer - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * - * Copyright (C) IBM Corporation, 2009 - */ - -#include <linux/kallsyms.h> -#include <linux/uaccess.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/fs.h> - -#include "trace_output.h" -#include "trace.h" - -#include <linux/hw_breakpoint.h> -#include <asm/hw_breakpoint.h> - -#include <asm/atomic.h> - -/* - * For now, let us restrict the no. of symbols traced simultaneously to number - * of available hardware breakpoint registers. - */ -#define KSYM_TRACER_MAX HBP_NUM - -#define KSYM_TRACER_OP_LEN 3 /* rw- */ - -struct trace_ksym { - struct perf_event **ksym_hbp; - struct perf_event_attr attr; -#ifdef CONFIG_PROFILE_KSYM_TRACER - atomic64_t counter; -#endif - struct hlist_node ksym_hlist; -}; - -static struct trace_array *ksym_trace_array; - -static unsigned int ksym_filter_entry_count; -static unsigned int ksym_tracing_enabled; - -static HLIST_HEAD(ksym_filter_head); - -static DEFINE_MUTEX(ksym_tracer_mutex); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - -#define MAX_UL_INT 0xffffffff - -void ksym_collect_stats(unsigned long hbp_hit_addr) -{ - struct hlist_node *node; - struct trace_ksym *entry; - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == hbp_hit_addr) { - atomic64_inc(&entry->counter); - break; - } - } - rcu_read_unlock(); -} -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -void ksym_hbp_handler(struct perf_event *hbp, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct ring_buffer_event *event; - struct ksym_trace_entry *entry; - struct ring_buffer *buffer; - int pc; - - if (!ksym_tracing_enabled) - return; - - buffer = ksym_trace_array->buffer; - - pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, - sizeof(*entry), 0, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(regs); - entry->type = hw_breakpoint_type(hbp); - entry->addr = hw_breakpoint_addr(hbp); - strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - ksym_collect_stats(hw_breakpoint_addr(hbp)); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -/* Valid access types are represented as - * - * rw- : Set Read/Write Access Breakpoint - * -w- : Set Write Access Breakpoint - * --- : Clear Breakpoints - * --x : Set Execution Break points (Not available yet) - * - */ -static int ksym_trace_get_access_type(char *str) -{ - int access = 0; - - if (str[0] == 'r') - access |= HW_BREAKPOINT_R; - - if (str[1] == 'w') - access |= HW_BREAKPOINT_W; - - if (str[2] == 'x') - access |= HW_BREAKPOINT_X; - - switch (access) { - case HW_BREAKPOINT_R: - case HW_BREAKPOINT_W: - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - return access; - default: - return -EINVAL; - } -} - -/* - * There can be several possible malformed requests and we attempt to capture - * all of them. We enumerate some of the rules - * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. - * i.e. multiple ':' symbols disallowed. Possible uses are of the form - * <module>:<ksym_name>:<op>. - * 2. No delimiter symbol ':' in the input string - * 3. Spurious operator symbols or symbols not in their respective positions - * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file - * 5. Kernel symbol not a part of /proc/kallsyms - * 6. 
Duplicate requests - */ -static int parse_ksym_trace_str(char *input_string, char **ksymname, - unsigned long *addr) -{ - int ret; - - *ksymname = strsep(&input_string, ":"); - *addr = kallsyms_lookup_name(*ksymname); - - /* Check for malformed request: (2), (1) and (5) */ - if ((!input_string) || - (strlen(input_string) != KSYM_TRACER_OP_LEN) || - (*addr == 0)) - return -EINVAL;; - - ret = ksym_trace_get_access_type(input_string); - - return ret; -} - -int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) -{ - struct trace_ksym *entry; - int ret = -ENOMEM; - - if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { - printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" - " new requests for tracing can be accepted now.\n", - KSYM_TRACER_MAX); - return -ENOSPC; - } - - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - hw_breakpoint_init(&entry->attr); - - entry->attr.bp_type = op; - entry->attr.bp_addr = addr; - entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - - entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - - if (IS_ERR(entry->ksym_hbp)) { - ret = PTR_ERR(entry->ksym_hbp); - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); - goto err; - } - - hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - ksym_filter_entry_count++; - - return 0; - -err: - kfree(entry); - - return ret; -} - -static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - struct trace_seq *s; - ssize_t cnt = 0; - int ret; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - trace_seq_init(s); - - mutex_lock(&ksym_tracer_mutex); - - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", - (void *)(unsigned long)entry->attr.bp_addr); - if (entry->attr.bp_type == HW_BREAKPOINT_R) - ret = trace_seq_puts(s, "r--\n"); - else if (entry->attr.bp_type == HW_BREAKPOINT_W) - ret = trace_seq_puts(s, "-w-\n"); - else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) - ret = trace_seq_puts(s, "rw-\n"); - WARN_ON_ONCE(!ret); - } - - cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); - - mutex_unlock(&ksym_tracer_mutex); - - kfree(s); - - return cnt; -} - -static void __ksym_trace_reset(void) -{ - struct trace_ksym *entry; - struct hlist_node *node, *node1; - - mutex_lock(&ksym_tracer_mutex); - hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, - ksym_hlist) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - ksym_filter_entry_count--; - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - } - mutex_unlock(&ksym_tracer_mutex); -} - -static ssize_t ksym_trace_filter_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - char *buf, *input_string, *ksymname = NULL; - unsigned long ksym_addr = 0; - int ret, op, changed = 0; - - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(buf, buffer, count)) - goto out; - - buf[count] = '\0'; - input_string = strstrip(buf); - - /* - * Clear all breakpoints if: - * 1: echo > ksym_trace_filter - * 2: echo 0 > ksym_trace_filter - * 3: echo "*:---" > ksym_trace_filter - */ - if (!input_string[0] || !strcmp(input_string, "0") || - !strcmp(input_string, "*:---")) { - __ksym_trace_reset(); - ret = 0; - 
goto out; - } - - ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) - goto out; - - mutex_lock(&ksym_tracer_mutex); - - ret = -EINVAL; - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == ksym_addr) { - /* Check for malformed request: (6) */ - if (entry->attr.bp_type != op) - changed = 1; - else - goto out_unlock; - break; - } - } - if (changed) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - entry->attr.bp_type = op; - ret = 0; - if (op > 0) { - entry->ksym_hbp = - register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - if (IS_ERR(entry->ksym_hbp)) - ret = PTR_ERR(entry->ksym_hbp); - else - goto out_unlock; - } - /* Error or "symbol:---" case: drop it */ - ksym_filter_entry_count--; - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - goto out_unlock; - } else { - /* Check for malformed request: (4) */ - if (op) - ret = process_new_ksym_entry(ksymname, op, ksym_addr); - } -out_unlock: - mutex_unlock(&ksym_tracer_mutex); -out: - kfree(buf); - return !ret ? count : ret; -} - -static const struct file_operations ksym_tracing_fops = { - .open = tracing_open_generic, - .read = ksym_trace_filter_read, - .write = ksym_trace_filter_write, -}; - -static void ksym_trace_reset(struct trace_array *tr) -{ - ksym_tracing_enabled = 0; - __ksym_trace_reset(); -} - -static int ksym_trace_init(struct trace_array *tr) -{ - int cpu, ret = 0; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - ksym_tracing_enabled = 1; - ksym_trace_array = tr; - - return ret; -} - -static void ksym_trace_print_header(struct seq_file *m) -{ - seq_puts(m, - "# TASK-PID CPU# Symbol " - "Type Function\n"); - seq_puts(m, - "# | | | " - " | |\n"); -} - -static enum print_line_t ksym_trace_output(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct ksym_trace_entry *field; - char str[KSYM_SYMBOL_LEN]; - int ret; - - if (entry->type != TRACE_KSYM) - return TRACE_TYPE_UNHANDLED; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, - entry->pid, iter->cpu, (char *)field->addr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (field->type) { - case HW_BREAKPOINT_R: - ret = trace_seq_printf(s, " R "); - break; - case HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " RW "); - break; - default: - return TRACE_TYPE_PARTIAL_LINE; - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - sprint_symbol(str, field->ip); - ret = trace_seq_printf(s, "%s\n", str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -struct tracer ksym_tracer __read_mostly = -{ - .name = "ksym_tracer", - .init = ksym_trace_init, - .reset = ksym_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_ksym, -#endif - .print_header = ksym_trace_print_header, - .print_line = ksym_trace_output -}; - -#ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_profile_show(struct seq_file *m, void *v) -{ - struct hlist_node *node; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; - - seq_puts(m, " Access Type "); - seq_puts(m, " Symbol Counter\n"); - seq_puts(m, " ----------- "); - seq_puts(m, " ------ -------\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - - access_type = entry->attr.bp_type; - - switch (access_type) 
{ - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); - } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", "<NA>"); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); - } - rcu_read_unlock(); - - return 0; -} - -static int ksym_profile_open(struct inode *node, struct file *file) -{ - return single_open(file, ksym_profile_show, NULL); -} - -static const struct file_operations ksym_profile_fops = { - .open = ksym_profile_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - trace_create_file("ksym_profile", 0444, d_tracer, - NULL, &ksym_profile_fops); -#endif - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0acd834659e..017fa376505 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/time.h> #include <asm/atomic.h> diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8e46b3323cd..02272baa220 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -209,6 +206,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } +EXPORT_SYMBOL(trace_seq_putc); int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { @@ -253,7 +251,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) void *ret; if (s->full) - return 0; + return NULL; if (len > ((PAGE_SIZE - 1) - s->len)) { s->full = 1; @@ -355,6 +353,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, } EXPORT_SYMBOL(ftrace_print_symbols_seq); +const char * +ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%s%2.2x", i == 0 ? 
"" : " ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_hex_seq); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -726,6 +739,9 @@ int register_ftrace_event(struct trace_event *event) if (WARN_ON(!event)) goto out; + if (WARN_ON(!event->funcs)) + goto out; + INIT_LIST_HEAD(&event->list); if (!event->type) { @@ -758,14 +774,14 @@ int register_ftrace_event(struct trace_event *event) goto out; } - if (event->trace == NULL) - event->trace = trace_nop_print; - if (event->raw == NULL) - event->raw = trace_nop_print; - if (event->hex == NULL) - event->hex = trace_nop_print; - if (event->binary == NULL) - event->binary = trace_nop_print; + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; key = event->type & (EVENT_HASHSIZE - 1); @@ -807,13 +823,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); * Standard events */ -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return TRACE_TYPE_HANDLED; } /* TRACE_FN */ -static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -840,7 +858,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; @@ -854,7 +873,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -867,7 +887,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -880,14 +901,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static struct trace_event trace_fn_event = { - .type = TRACE_FN, +static struct trace_event_functions trace_fn_funcs = { .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, .binary = trace_fn_bin, }; +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + /* TRACE_CTX an TRACE_WAKE */ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) @@ -916,13 +941,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_print(struct 
trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_print(iter, "==>"); } static enum print_line_t trace_wake_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return trace_ctxwake_print(iter, " +"); } @@ -950,12 +976,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, 0); } -static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, '+'); } @@ -984,18 +1012,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, 0); } -static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, '+'); } static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct ctx_switch_entry *field; struct trace_seq *s = &iter->seq; @@ -1012,81 +1042,34 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, +static struct trace_event_functions trace_ctx_funcs = { .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, .binary = trace_ctxwake_bin, }; -static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, .binary = trace_ctxwake_bin, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = 
trace_special_bin, +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, }; /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct stack_entry *field; struct trace_seq *s = &iter->seq; @@ -1114,17 +1097,18 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_stack_funcs = { + .trace = trace_stack_print, +}; + static struct trace_event trace_stack_event = { .type = TRACE_STACK, - .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, + .funcs = &trace_stack_funcs, }; /* TRACE_USER_STACK */ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct userstack_entry *field; struct trace_seq *s = &iter->seq; @@ -1143,17 +1127,19 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_user_stack_funcs = { + .trace = trace_user_stack_print, +}; + static struct trace_event trace_user_stack_event = { .type = TRACE_USER_STACK, - .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, + .funcs = &trace_user_stack_funcs, }; /* TRACE_BPRINT */ static enum print_line_t -trace_bprint_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; @@ -1178,7 +1164,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags) static enum print_line_t -trace_bprint_raw(struct trace_iterator *iter, int flags) +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct bprint_entry *field; struct trace_seq *s = &iter->seq; @@ -1197,16 +1184,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; static struct trace_event trace_bprint_event = { .type = TRACE_BPRINT, - .trace = trace_bprint_print, - .raw = trace_bprint_raw, + .funcs = &trace_bprint_funcs, }; /* TRACE_PRINT */ static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct print_entry *field; struct trace_seq *s = &iter->seq; @@ -1225,7 +1215,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct print_entry *field; @@ -1240,18 +1231,21 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, +static struct trace_event_functions trace_print_funcs = { .trace = trace_print_print, .raw = trace_print_raw, }; +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, 
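/*
 * Sketch (editorial, not part of the patch): after the refactoring above, the
 * print methods live in a separate struct trace_event_functions and each
 * struct trace_event only carries its type plus a ->funcs pointer, e.g.
 * (names below are hypothetical):
 *
 *	static struct trace_event_functions my_funcs = {
 *		.trace = my_trace_print,
 *	};
 *	static struct trace_event my_event = {
 *		.type  = TRACE_PRINT,
 *		.funcs = &my_funcs,
 *	};
 *
 * mirroring the trace_fn/ctx/wake/stack/user_stack/bprint/print conversions
 * in the hunks above, where callbacks such as trace_ctxwake_bin can be shared
 * between function tables.
 */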
&trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 9d91c72ba38..c038eba0492 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, - int flags); + int flags, struct trace_event *event); extern int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5fca0f51fde..8f758d070c4 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr, } static void -probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; unsigned long flags; @@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; @@ -139,21 +138,21 @@ static int tracing_sched_register(void) { int ret; - ret = register_trace_sched_wakeup(probe_sched_wakeup); + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = register_trace_sched_wakeup_new(probe_sched_wakeup); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_sched_switch); + ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -162,17 +161,17 @@ static int tracing_sched_register(void) return ret; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { - unregister_trace_sched_switch(probe_sched_switch); - unregister_trace_sched_wakeup_new(probe_sched_wakeup); - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0271742abb8..4086eae6e81 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int resched; int cpu; int pc; @@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = 
raw_smp_processor_id(); if (cpu != wakeup_current_cpu) @@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); out_enable: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -98,7 +97,8 @@ static int report_latency(cycle_t delta) return 1; } -static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) { if (task != wakeup_task) return; @@ -107,8 +107,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) } static void notrace -probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) +probe_wakeup_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; cycle_t T0, T1, delta; @@ -200,7 +200,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p, int success) { struct trace_array_cpu *data; int cpu = smp_processor_id(); @@ -264,28 +264,28 @@ static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = register_trace_sched_wakeup(probe_wakeup); + ret = register_trace_sched_wakeup(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = register_trace_sched_wakeup_new(probe_wakeup); + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_wakeup_sched_switch); + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } - ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); @@ -312,19 +312,19 @@ static void start_wakeup_tracer(struct trace_array *tr) return; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup, NULL); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; unregister_ftrace_function(&trace_ops); - unregister_trace_sched_switch(probe_wakeup_sched_switch); - unregister_trace_sched_wakeup_new(probe_wakeup); - unregister_trace_sched_wakeup(probe_wakeup); - unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } static int __wakeup_tracer_init(struct trace_array *tr) @@ -382,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -396,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #ifdef 
CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d6..155a415b320 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,6 +3,7 @@ #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> static inline int trace_valid_entry(struct trace_entry *entry) { @@ -12,12 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: - case TRACE_KSYM: return 1; } return 0; @@ -29,7 +27,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -255,7 +253,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void __ftrace_dump(bool disable_tracing); +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -266,7 +265,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); if (ftrace_dump_on_oops) - __ftrace_dump(false); + __ftrace_dump(false, DUMP_ALL); return 0; } @@ -690,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) @@ -754,112 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. 
- */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ - -#ifdef CONFIG_KSYM_TRACER -static int ksym_selftest_dummy; - -int -trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - ksym_selftest_dummy = 0; - /* Register the read-write tracing request */ - - ret = process_new_ksym_entry("ksym_selftest_dummy", - HW_BREAKPOINT_R | HW_BREAKPOINT_W, - (unsigned long)(&ksym_selftest_dummy)); - - if (ret < 0) { - printk(KERN_CONT "ksym_trace read-write startup test failed\n"); - goto ret_path; - } - /* Perform a read and a write operation over the dummy variable to - * trigger the tracer - */ - if (ksym_selftest_dummy == 0) - ksym_selftest_dummy++; - - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - /* read & write operations - one each is performed on the dummy variable - * triggering two entries in the trace buffer - */ - if (!ret && count != 2) { - printk(KERN_CONT "Ksym tracer startup test failed"); - ret = -1; - } - -ret_path: - return ret; -} -#endif /* CONFIG_KSYM_TRACER */ - diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee3..a6b7e0e0f3e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -110,12 +110,12 @@ static inline void check_stack(void) static void stack_trace_call(unsigned long ip, unsigned long parent_ip) { - int cpu, resched; + int cpu; if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. 
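 * (Editorial note, not part of the patch: the idea is that raising
 *  trace_active first makes stack_trace_call()'s own per-cpu trace_active
 *  bookkeeping treat this CPU as already active and skip tracing, so taking
 *  max_stack_lock below cannot recurse into the stack tracer and deadlock.)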
+ */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } @@ -225,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pF\n", (void *)addr); + return seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index a4bb239eb98..96cffb269e7 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,6 +10,7 @@ #include <linux/list.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include "trace_stat.h" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd..bac752f0cfb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/ftrace.h> #include <linux/perf_event.h> @@ -14,6 +15,55 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type); + +static int syscall_enter_define_fields(struct ftrace_event_call *call); +static int syscall_exit_define_fields(struct ftrace_event_call *call); + +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + +static struct list_head * +syscall_get_enter_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +static struct list_head * +syscall_get_exit_fields(struct ftrace_event_call *call) +{ + return &syscall_exit_fields; +} + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .get_fields = syscall_get_exit_fields, + .raw_init = init_syscall_trace, +}; + extern unsigned long __start_syscalls_metadata[]; extern unsigned long __stop_syscalls_metadata[]; @@ -52,7 +102,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) } enum print_line_t -print_syscall_enter(struct trace_iterator *iter, int flags) +print_syscall_enter(struct 
trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -67,7 +118,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_event->id != ent->type) { + if (entry->enter_event->event.type != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -104,7 +155,8 @@ end: } enum print_line_t -print_syscall_exit(struct trace_iterator *iter, int flags) +print_syscall_exit(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -122,7 +174,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_event->id != ent->type) { + if (entry->exit_event->event.type != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -143,73 +195,68 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) +static +int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); + int pos = 0; - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - trace_seq_puts(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], - sizeof(unsigned long), - i == entry->nb_args - 1 ? 
"" : ", "); - if (!ret) - return 0; + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); } - trace_seq_putc(s, '"'); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", - entry->args[i]); - if (!ret) - return 0; - } +#undef LEN_OR_ZERO - return trace_seq_putc(s, '\n'); + /* return the length of print_fmt */ + return pos; } -int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) +static int set_syscall_print_fmt(struct ftrace_event_call *call) { - int ret; - struct syscall_trace_exit trace; + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; - ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(long, ret)); - if (!ret) + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; return 0; + } + + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; - return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_syscall_print_fmt(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); } -int syscall_enter_define_fields(struct ftrace_event_call *call) +static int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -232,7 +279,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -int syscall_exit_define_fields(struct ftrace_event_call *call) +static int syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -247,7 +294,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(struct pt_regs *regs, long id) +void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -269,7 +316,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; event = trace_current_buffer_lock_reserve(&buffer, - sys_data->enter_event->id, size, 0, 0); + sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -282,7 +329,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(struct pt_regs *regs, long ret) +void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -301,7 +348,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) return; event = trace_current_buffer_lock_reserve(&buffer, - sys_data->exit_event->id, sizeof(*entry), 0, 0); + sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -324,7 +371,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter); + ret = 
register_trace_sys_enter(ftrace_syscall_enter, NULL); if (!ret) { set_bit(num, enabled_enter_syscalls); sys_refcount_enter++; @@ -344,7 +391,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) sys_refcount_enter--; clear_bit(num, enabled_enter_syscalls); if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter); + unregister_trace_sys_enter(ftrace_syscall_enter, NULL); mutex_unlock(&syscall_trace_lock); } @@ -358,7 +405,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit); + ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); if (!ret) { set_bit(num, enabled_exit_syscalls); sys_refcount_exit++; @@ -378,7 +425,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) sys_refcount_exit--; clear_bit(num, enabled_exit_syscalls); if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit); + unregister_trace_sys_exit(ftrace_syscall_exit, NULL); mutex_unlock(&syscall_trace_lock); } @@ -386,12 +433,22 @@ int init_syscall_trace(struct ftrace_event_call *call) { int id; - id = register_ftrace_event(call->event); - if (!id) - return -ENODEV; - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + + id = trace_event_raw_init(call); + + if (id < 0) { + free_syscall_print_fmt(call); + return id; + } + + return id; +} + +unsigned long __init arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; } int __init init_ftrace_syscalls(void) @@ -421,27 +478,24 @@ int __init init_ftrace_syscalls(void) } core_initcall(init_ftrace_syscalls); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS -static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); -static int sys_prof_refcount_enter; -static int sys_prof_refcount_exit; +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; -static void prof_syscall_enter(struct pt_regs *regs, long id) +static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - unsigned long flags; - char *trace_buf; - char *raw_data; + struct hlist_head *head; int syscall_nr; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -453,44 +507,24 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "profile buffer not large enough")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + rec = (struct syscall_trace_enter 
*)perf_trace_buf_prepare(size, + sys_data->enter_event->event.type, regs, &rctx); + if (!rec) + return; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + head = this_cpu_ptr(sys_data->enter_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } -int prof_sysenter_enable(struct ftrace_event_call *call) +int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -498,47 +532,44 @@ int prof_sysenter_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_enter) - ret = register_trace_sys_enter(prof_syscall_enter); + if (!sys_perf_refcount_enter) + ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); } else { - set_bit(num, enabled_prof_enter_syscalls); - sys_prof_refcount_enter++; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysenter_disable(struct ftrace_event_call *call) +void perf_sysenter_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_enter--; - clear_bit(num, enabled_prof_enter_syscalls); - if (!sys_prof_refcount_enter) - unregister_trace_sys_enter(prof_syscall_enter); + sys_perf_refcount_enter--; + clear_bit(num, enabled_perf_enter_syscalls); + if (!sys_perf_refcount_enter) + unregister_trace_sys_enter(perf_syscall_enter, NULL); mutex_unlock(&syscall_trace_lock); } -static void prof_syscall_exit(struct pt_regs *regs, long ret) +static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - unsigned long flags; + struct hlist_head *head; int syscall_nr; - char *trace_buf; - char *raw_data; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -553,45 +584,23 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) * Impossible, but be paranoid with the future * How to put this check outside runtime? 
*/ - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "exit event has grown above profile buffer size")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "exit event has grown above perf buffer size")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - - rec = (struct syscall_trace_exit *)raw_data; + rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, + sys_data->exit_event->event.type, regs, &rctx); + if (!rec) + return; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + head = this_cpu_ptr(sys_data->exit_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } -int prof_sysexit_enable(struct ftrace_event_call *call) +int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -599,33 +608,73 @@ int prof_sysexit_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_exit) - ret = register_trace_sys_exit(prof_syscall_exit); + if (!sys_perf_refcount_exit) + ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { pr_info("event trace: Could not activate" - "syscall entry trace point"); + "syscall exit trace point"); } else { - set_bit(num, enabled_prof_exit_syscalls); - sys_prof_refcount_exit++; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysexit_disable(struct ftrace_event_call *call) +void perf_sysexit_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_exit--; - clear_bit(num, enabled_prof_exit_syscalls); - if (!sys_prof_refcount_exit) - unregister_trace_sys_exit(prof_syscall_exit); + sys_perf_refcount_exit--; + clear_bit(num, enabled_perf_exit_syscalls); + if (!sys_perf_refcount_exit) + unregister_trace_sys_exit(perf_syscall_exit, NULL); mutex_unlock(&syscall_trace_lock); } +#endif /* CONFIG_PERF_EVENTS */ + +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_enter(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_enter(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysenter_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysenter_disable(event); + return 0; #endif + } + return 0; +} +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_exit(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_exit(event); + return 0; +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysexit_enable(event); + case TRACE_REG_PERF_UNREGISTER: + 
perf_sysexit_disable(event); + return 0; +#endif + } + return 0; +} diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a552ca..00000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> - * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/hrtimer.h> -#include <linux/uaccess.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/irq.h> -#include <linux/fs.h> - -#include <asm/stacktrace.h> - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - info->pos++; - } -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < 
sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(void *unused) -{ - struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void start_stack_timers(void) -{ - on_each_cpu(start_stack_timer, NULL, 1); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static int stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - tracing_start_cmdline_record(); - - mutex_lock(&sample_timer_lock); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); - return 0; -} - -static void stack_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static const struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - - 
trace_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); -} diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 40cafb07dff..a7cc3793baf 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> +#include <linux/slab.h> #include <linux/kref.h> #include "trace_stat.h" #include "trace.h" @@ -48,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref) /* Insertion of a work */ static void -probe_workqueue_insertion(struct task_struct *wq_thread, +probe_workqueue_insertion(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -69,7 +71,8 @@ found: /* Execution of a work */ static void -probe_workqueue_execution(struct task_struct *wq_thread, +probe_workqueue_execution(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -89,7 +92,8 @@ found: } /* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +static void probe_workqueue_creation(void *ignore, + struct task_struct *wq_thread, int cpu) { struct cpu_workqueue_stats *cws; unsigned long flags; @@ -113,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) } /* Destruction of a cpu workqueue thread */ -static void probe_workqueue_destruction(struct task_struct *wq_thread) +static void +probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) { /* Workqueue only execute on one cpu */ int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -258,19 +263,19 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; - ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; - ret = register_trace_workqueue_execution(probe_workqueue_execution); + ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); if (ret) goto no_insertion; - ret = register_trace_workqueue_creation(probe_workqueue_creation); + ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); if (ret) goto no_execution; - ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); if (ret) goto no_creation; @@ -282,11 +287,11 @@ int __init trace_workqueue_early_init(void) return 0; no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation); + unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution); + unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion); + unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); out: pr_warning("trace_workqueue: unable to trace workqueues\n"); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index cc89be5bc0f..c77f3eceea2 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; */ struct tracepoint_entry { struct hlist_node hlist; - void **funcs; + struct tracepoint_func *funcs; int refcount; /* Number of times armed. 0 if disarmed. 
*/ char name[0]; }; @@ -64,12 +64,12 @@ struct tp_probes { struct rcu_head rcu; struct list_head list; } u; - void *probes[0]; + struct tracepoint_func probes[0]; }; static inline void *allocate_probes(int count) { - struct tp_probes *p = kmalloc(count * sizeof(void *) + struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func) + sizeof(struct tp_probes), GFP_KERNEL); return p == NULL ? NULL : p->probes; } @@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head) kfree(container_of(head, struct tp_probes, u.rcu)); } -static inline void release_probes(void *old) +static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, @@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry) if (!tracepoint_debug || !entry->funcs) return; - for (i = 0; entry->funcs[i]; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); + for (i = 0; entry->funcs[i].func; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); } -static void * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +static struct tracepoint_func * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, + void *probe, void *data) { int nr_probes = 0; - void **old, **new; + struct tracepoint_func *old, *new; WARN_ON(!probe); @@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes]; nr_probes++) - if (old[nr_probes] == probe) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe && + old[nr_probes].data == data) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ @@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) if (new == NULL) return ERR_PTR(-ENOMEM); if (old) - memcpy(new, old, nr_probes * sizeof(void *)); - new[nr_probes] = probe; - new[nr_probes + 1] = NULL; + memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); + new[nr_probes].func = probe; + new[nr_probes].data = data; + new[nr_probes + 1].func = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) } static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, + void *probe, void *data) { int nr_probes = 0, nr_del = 0, i; - void **old, **new; + struct tracepoint_func *old, *new; old = entry->funcs; @@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes]; nr_probes++) { - if ((!probe || old[nr_probes] == probe)) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if (!probe || + (old[nr_probes].func == probe && + old[nr_probes].data == data)) nr_del++; } @@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); - for (i = 0; old[i]; i++) - if ((probe && old[i] != probe)) + for (i = 0; old[i].func; i++) + if (probe && + (old[i].func != probe || old[i].data != data)) new[j++] = old[i]; - new[nr_probes - nr_del] = NULL; + new[nr_probes - nr_del].func = NULL; entry->refcount = nr_probes - 
nr_del; entry->funcs = new; } @@ -315,18 +322,19 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } -static void *tracepoint_add_probe(const char *name, void *probe) +static struct tracepoint_func * +tracepoint_add_probe(const char *name, void *probe, void *data) { struct tracepoint_entry *entry; - void *old; + struct tracepoint_func *old; entry = get_tracepoint(name); if (!entry) { entry = add_tracepoint(name); if (IS_ERR(entry)) - return entry; + return (struct tracepoint_func *)entry; } - old = tracepoint_entry_add_probe(entry, probe); + old = tracepoint_entry_add_probe(entry, probe, data); if (IS_ERR(old) && !entry->refcount) remove_tracepoint(entry); return old; @@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe) * Returns 0 if ok, error value on error. * The probe address must at least be aligned on the architecture pointer size. */ -int tracepoint_probe_register(const char *name, void *probe) +int tracepoint_probe_register(const char *name, void *probe, void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe); + old = tracepoint_add_probe(name, probe, data); mutex_unlock(&tracepoints_mutex); if (IS_ERR(old)) return PTR_ERR(old); @@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe) } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -static void *tracepoint_remove_probe(const char *name, void *probe) +static struct tracepoint_func * +tracepoint_remove_probe(const char *name, void *probe, void *data) { struct tracepoint_entry *entry; - void *old; + struct tracepoint_func *old; entry = get_tracepoint(name); if (!entry) return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe); + old = tracepoint_entry_remove_probe(entry, probe, data); if (IS_ERR(old)) return old; if (!entry->refcount) @@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe) * itself uses stop_machine(), which insures that every preempt disabled section * have finished. 
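Probes are now carried as (func, data) pairs, so the registration above and the removal that follows match on both the function and its data cookie. A minimal sketch of the widened API from a caller's point of view; the tracepoint name "my_subsys_event", the probe body and the tag string are hypothetical, only the three-argument register/unregister signatures come from this patch:

#include <linux/tracepoint.h>
#include <linux/kernel.h>
#include <linux/sched.h>

/* Hypothetical registration cookie, handed back as the probe's first argument. */
static const char my_tag[] = "example";

/* Probe callbacks gain a leading void *data, as the probe_workqueue_*() changes above show. */
static void my_probe(void *data, struct task_struct *task)
{
	pr_debug("probe(%s) fired for %s\n", (const char *)data, task->comm);
}

static int my_probe_attach(void)
{
	return tracepoint_probe_register("my_subsys_event", my_probe, (void *)my_tag);
}

static void my_probe_detach(void)
{
	/* Must pass the same (probe, data) pair that was registered. */
	tracepoint_probe_unregister("my_subsys_event", my_probe, (void *)my_tag);
}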
*/ -int tracepoint_probe_unregister(const char *name, void *probe) +int tracepoint_probe_unregister(const char *name, void *probe, void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe); + old = tracepoint_remove_probe(name, probe, data); mutex_unlock(&tracepoints_mutex); if (IS_ERR(old)) return PTR_ERR(old); @@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old) * * caller must call tracepoint_probe_update_all() */ -int tracepoint_probe_register_noupdate(const char *name, void *probe) +int tracepoint_probe_register_noupdate(const char *name, void *probe, + void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe); + old = tracepoint_add_probe(name, probe, data); if (IS_ERR(old)) { mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); @@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); * * caller must call tracepoint_probe_update_all() */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +int tracepoint_probe_unregister_noupdate(const char *name, void *probe, + void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe); + old = tracepoint_remove_probe(name, probe, data); if (IS_ERR(old)) { mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048ed..0a67e041edf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -21,6 +21,7 @@ #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> +#include <linux/mm.h> /* * fill in basic accounting fields diff --git a/kernel/user.c b/kernel/user.c index 46d0165ca70..7e72614b736 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,7 +16,6 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/user_namespace.h> -#include "cred-internals.h" struct user_namespace init_user_ns = { .kref = { @@ -56,9 +55,6 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .user_ns = &init_user_ns, -#ifdef CONFIG_USER_SCHED - .tg = &init_task_group, -#endif }; /* @@ -75,268 +71,6 @@ static void uid_hash_remove(struct user_struct *up) put_user_ns(up->user_ns); } -#ifdef CONFIG_USER_SCHED - -static void sched_destroy_user(struct user_struct *up) -{ - sched_destroy_group(up->tg); -} - -static int sched_create_user(struct user_struct *up) -{ - int rc = 0; - - up->tg = sched_create_group(&root_task_group); - if (IS_ERR(up->tg)) - rc = -ENOMEM; - - set_tg_uid(up); - - return rc; -} - -#else /* CONFIG_USER_SCHED */ - -static void sched_destroy_user(struct user_struct *up) { } -static int sched_create_user(struct user_struct *up) { return 0; } - -#endif /* CONFIG_USER_SCHED */ - -#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) - -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) -{ - struct user_struct *user; - struct hlist_node *h; - - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { - /* possibly resurrect an "almost deleted" object */ - if (atomic_inc_return(&user->__count) == 1) - cancel_delayed_work(&user->work); - return user; - } - } - - return NULL; -} - -static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ -static DEFINE_MUTEX(uids_mutex); - -static inline void uids_mutex_lock(void) -{ - mutex_lock(&uids_mutex); -} - -static inline void 
uids_mutex_unlock(void) -{ - mutex_unlock(&uids_mutex); -} - -/* uid directory attributes */ -#ifdef CONFIG_FAIR_GROUP_SCHED -static ssize_t cpu_shares_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); -} - -static ssize_t cpu_shares_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long shares; - int rc; - - sscanf(buf, "%lu", &shares); - - rc = sched_group_set_shares(up->tg, shares); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_share_attr = - __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); -#endif - -#ifdef CONFIG_RT_GROUP_SCHED -static ssize_t cpu_rt_runtime_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); -} - -static ssize_t cpu_rt_runtime_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_runtime; - int rc; - - sscanf(buf, "%ld", &rt_runtime); - - rc = sched_group_set_rt_runtime(up->tg, rt_runtime); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_rt_runtime_attr = - __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); - -static ssize_t cpu_rt_period_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - - return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); -} - -static ssize_t cpu_rt_period_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t size) -{ - struct user_struct *up = container_of(kobj, struct user_struct, kobj); - unsigned long rt_period; - int rc; - - sscanf(buf, "%lu", &rt_period); - - rc = sched_group_set_rt_period(up->tg, rt_period); - - return (rc ? rc : size); -} - -static struct kobj_attribute cpu_rt_period_attr = - __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); -#endif - -/* default attributes per uid directory */ -static struct attribute *uids_attributes[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - &cpu_share_attr.attr, -#endif -#ifdef CONFIG_RT_GROUP_SCHED - &cpu_rt_runtime_attr.attr, - &cpu_rt_period_attr.attr, -#endif - NULL -}; - -/* the lifetime of user_struct is not managed by the core (now) */ -static void uids_release(struct kobject *kobj) -{ - return; -} - -static struct kobj_type uids_ktype = { - .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = uids_attributes, - .release = uids_release, -}; - -/* - * Create /sys/kernel/uids/<uid>/cpu_share file for this user - * We do not create this file for users in a user namespace (until - * sysfs tagging is implemented). - * - * See Documentation/scheduler/sched-design-CFS.txt for ramifications. 
- */ -static int uids_user_create(struct user_struct *up) -{ - struct kobject *kobj = &up->kobj; - int error; - - memset(kobj, 0, sizeof(struct kobject)); - if (up->user_ns != &init_user_ns) - return 0; - kobj->kset = uids_kset; - error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); - if (error) { - kobject_put(kobj); - goto done; - } - - kobject_uevent(kobj, KOBJ_ADD); -done: - return error; -} - -/* create these entries in sysfs: - * "/sys/kernel/uids" directory - * "/sys/kernel/uids/0" directory (for root user) - * "/sys/kernel/uids/0/cpu_share" file (for root user) - */ -int __init uids_sysfs_init(void) -{ - uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); - if (!uids_kset) - return -ENOMEM; - - return uids_user_create(&root_user); -} - -/* delayed work function to remove sysfs directory for a user and free up - * corresponding structures. - */ -static void cleanup_user_struct(struct work_struct *w) -{ - struct user_struct *up = container_of(w, struct user_struct, work.work); - unsigned long flags; - int remove_user = 0; - - /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() - * atomic. - */ - uids_mutex_lock(); - - spin_lock_irqsave(&uidhash_lock, flags); - if (atomic_read(&up->__count) == 0) { - uid_hash_remove(up); - remove_user = 1; - } - spin_unlock_irqrestore(&uidhash_lock, flags); - - if (!remove_user) - goto done; - - if (up->user_ns == &init_user_ns) { - kobject_uevent(&up->kobj, KOBJ_REMOVE); - kobject_del(&up->kobj); - kobject_put(&up->kobj); - } - - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - -done: - uids_mutex_unlock(); -} - -/* IRQs are disabled and uidhash_lock is held upon function entry. - * IRQ state (as stored in flags) is restored and uidhash_lock released - * upon function exit. - */ -static void free_user(struct user_struct *up, unsigned long flags) -{ - INIT_DELAYED_WORK(&up->work, cleanup_user_struct); - schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); - spin_unlock_irqrestore(&uidhash_lock, flags); -} - -#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ - static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { struct user_struct *user; @@ -352,11 +86,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -int uids_sysfs_init(void) { return 0; } -static inline int uids_user_create(struct user_struct *up) { return 0; } -static inline void uids_mutex_lock(void) { } -static inline void uids_mutex_unlock(void) { } - /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -365,32 +94,11 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } -#endif - -#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) -/* - * We need to check if a setuid can take place. This function should be called - * before successfully completing the setuid. - */ -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - - return sched_rt_can_attach(up->tg, tsk); - -} -#else -int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) -{ - return 1; -} -#endif - /* * Locate the user_struct for the passed UID. If found, take a ref on it. 
The * caller must undo that ref with free_uid(). @@ -428,11 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up, *new; - /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() - * atomic. - */ - uids_mutex_lock(); - spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -445,14 +148,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) new->uid = uid; atomic_set(&new->__count, 1); - if (sched_create_user(new) < 0) - goto out_free_user; - new->user_ns = get_user_ns(ns); - if (uids_user_create(new)) - goto out_destoy_sched; - /* * Before adding this, check whether we raced * on adding the same user already.. @@ -460,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_USER_SCHED - * is defined, since we serialize alloc_uid() using - * uids_mutex. Hence no need to call - * sched_destroy_user() or remove_user_sysfs_dir(). - */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -475,17 +167,9 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); } - uids_mutex_unlock(); - return up; -out_destoy_sched: - sched_destroy_user(new); - put_user_ns(new->user_ns); -out_free_user: - kmem_cache_free(uid_cachep, new); out_unlock: - uids_mutex_unlock(); return NULL; } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 076c7c8215b..25915832291 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -54,8 +55,8 @@ int create_user_ns(struct cred *new) #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - /* alloc_uid() incremented the userns refcount. Just set it to 1 */ - kref_set(&ns->kref, 1); + /* root_user holds a reference to ns, our reference can be dropped */ + put_user_ns(ns); return 0; } @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? 
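The mapping walk reduces to three outcomes, identical for the uid function above and the gid twin around this point. A hedged usage sketch; uid_for_display() is a made-up helper, and the declaration of user_ns_map_uid() is assumed to live in linux/user_namespace.h as this series arranges:

#include <linux/user_namespace.h>
#include <linux/cred.h>

/*
 * Made-up wrapper: present @uid, owned by @cred, to a viewer living in @to.
 *   - same namespace                            -> uid unchanged
 *   - cred->user created @to or an ancestor     -> 0
 *   - no useful relationship                    -> overflowuid (typically 65534)
 */
static uid_t uid_for_display(struct user_namespace *to,
			     const struct cred *cred, uid_t uid)
{
	return user_ns_map_uid(to, cred, uid);
}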
+ */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 00000000000..7f9c3c52ecc --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,577 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/lockdep.h> +#include <linux/notifier.h> +#include <linux/module.h> +#include <linux/sysctl.h> + +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __read_mostly did_panic; +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. 
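The shift used by get_timestamp() just below approximates a divide by 10^9 with a divide by 2^30, so the returned "seconds" run about 7% slow, which is harmless for a threshold comparison. A standalone check in plain C, no kernel dependencies:

#include <stdio.h>

int main(void)
{
	unsigned long long ns = 600ULL * 1000 * 1000 * 1000;	/* 600 s in nanoseconds */

	printf("exact:   %llu s\n", ns / 1000000000ULL);	/* 600 */
	printf("shifted: %llu s\n", ns >> 30);			/* 558, about 7% low */
	return 0;
}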
+ */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? + * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void touch_nmi_watchdog(void) +{ + if (watchdog_enabled) { + unsigned cpu; + + for_each_present_cpu(cpu) { + if (per_cpu(watchdog_nmi_touch, cpu) != true) + per_cpu(watchdog_nmi_touch, cpu) = true; + } + } + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +#endif + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* watchdog detector functions */ +static int is_hardlockup(void) +{ + unsigned long hrint = __get_cpu_var(hrtimer_interrupts); + + if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) + return 1; + + __get_cpu_var(hrtimer_interrupts_saved) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts) +{ + unsigned long now = get_timestamp(smp_processor_id()); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +static int +watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = watchdog_panic, +}; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__get_cpu_var(watchdog_nmi_touch) == true) { + __get_cpu_var(watchdog_nmi_touch) = false; + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. 
If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(__get_cpu_var(softlockup_touch_sync))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + __get_cpu_var(softlockup_touch_sync) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + smp_processor_id(), duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *unused) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in watchdog_timer_fn(). 
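The 60 seconds mentioned above follow from the defaults earlier in this file: softlockup_thresh is 60 and get_sample_period() hands the hrtimer one fifth of that, so five timer ticks and five kthread wakeups fit inside a single threshold window. A plain C restatement of that arithmetic:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned long long softlockup_thresh = 60;	/* default from this patch */
	unsigned long long period = softlockup_thresh / 5 * NSEC_PER_SEC;

	/* 12000000000 ns, i.e. a 12 s sample period against a 60 s threshold */
	printf("sample period: %llu ns (%llu s)\n", period, period / NSEC_PER_SEC);
	return 0;
}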
+ */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return -1; + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + + /* enable the perf event */ + if (watchdog_nmi_enable(cpu) != 0) + return -1; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return -1; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result = 0; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ 
+#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (watchdog_prepare_cpu(hotcpu)) + return NOTIFY_BAD; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enable(hotcpu)) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(err == NOTIFY_BAD); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee48658805..f77afd93922 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,19 +1,26 @@ /* - * linux/kernel/workqueue.c + * kernel/workqueue.c - generic async execution with shared worker pool * - * Generic mechanism for defining kernel helper threads for running - * arbitrary tasks in process context. + * Copyright (C) 2002 Ingo Molnar * - * Started by Ingo Molnar, Copyright (C) 2002 + * Derived from the taskqueue/keventd code by: + * David Woodhouse <dwmw2@infradead.org> + * Andrew Morton + * Kai Petzke <wpp@marie.physik.tu-berlin.de> + * Theodore Ts'o <tytso@mit.edu> * - * Derived from the taskqueue/keventd code by: + * Made to use alloc_percpu by Christoph Lameter. * - * David Woodhouse <dwmw2@infradead.org> - * Andrew Morton - * Kai Petzke <wpp@marie.physik.tu-berlin.de> - * Theodore Ts'o <tytso@mit.edu> + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * - * Made to use alloc_percpu by Christoph Lameter. + * This is the generic async execution mechanism. Work items as are + * executed in process context. The worker pool is shared and + * automatically managed. There is one worker pool for each CPU and + * one extra for works which are better served by workers which are + * not bound to any specific CPU. + * + * Please read Documentation/workqueue.txt for details. 
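From a caller's point of view the rework keeps the familiar queueing API while execution moves to the shared pools. A hedged sketch of queueing onto the system workqueues exported further down in this patch; the work functions and the calling context are hypothetical, and the matching declarations are assumed to be in linux/workqueue.h:

#include <linux/workqueue.h>
#include <linux/kernel.h>

static void my_work_fn(struct work_struct *work)
{
	pr_info("executed in process context by a shared pool worker\n");
}

static DECLARE_WORK(my_bound_work, my_work_fn);
static DECLARE_WORK(my_unbound_work, my_work_fn);

static void kick_examples(void)
{
	/* schedule_work() targets system_wq, the default per-cpu pool. */
	schedule_work(&my_bound_work);

	/* system_unbound_wq is served by workers not tied to any CPU. */
	queue_work(system_unbound_wq, &my_unbound_work);
}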
*/ #include <linux/module.h> @@ -33,41 +40,291 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> +#include <linux/idr.h> + #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#include "workqueue_sched.h" + +enum { + /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + + /* worker flags */ + WORKER_STARTED = 1 << 0, /* started */ + WORKER_DIE = 1 << 1, /* die die die */ + WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ + + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ + BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, + BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only on emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, +}; + /* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only for + * everyone else. + * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * + * L: gcwq->lock protected. Access with gcwq->lock held. + * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * If GCWQ_DISASSOCIATED is set, it's identical to L. + * + * F: wq->flush_mutex protected. + * + * W: workqueue_lock protected. */ -struct cpu_workqueue_struct { - spinlock_t lock; +struct global_cwq; - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; +/* + * The poor guys doing the actual heavy lifting. All on-duty workers + * are either serving the manager role, on idle list or on busy hash. 
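A quick worked restatement of the constants above, in standalone C: the five "not running" worker flags OR together to 0xf8, and a hash order of 6 gives 64 busy-hash buckets with a 0x3f mask. The values are copied from the enum; nothing new is introduced:

#include <stdio.h>

enum {
	WORKER_PREP		= 1 << 3,
	WORKER_ROGUE		= 1 << 4,
	WORKER_REBIND		= 1 << 5,
	WORKER_CPU_INTENSIVE	= 1 << 6,
	WORKER_UNBOUND		= 1 << 7,

	BUSY_WORKER_HASH_ORDER	= 6,
	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
};

int main(void)
{
	unsigned int not_running = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
				   WORKER_CPU_INTENSIVE | WORKER_UNBOUND;

	printf("WORKER_NOT_RUNNING = %#x\n", not_running);		/* 0xf8 */
	printf("busy hash: %d buckets, mask %#x\n",
	       BUSY_WORKER_HASH_SIZE,
	       (unsigned int)BUSY_WORKER_HASH_MASK);			/* 64, 0x3f */
	return 0;
}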
+ */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; - struct workqueue_struct *wq; - struct task_struct *thread; -} ____cacheline_aligned; + struct work_struct *current_work; /* L: work being processed */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct global_cwq *gcwq; /* I: the associated gcwq */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ +}; + +/* + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. + */ +struct global_cwq { + spinlock_t lock; /* the gcwq lock */ + struct list_head worklist; /* L: list of pending works */ + unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ + + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + /* workers are chained either in the idle_list or busy_hash */ + struct list_head idle_list; /* X: list of idle workers */ + struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* L: hash of busy workers */ + + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + + struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ +} ____cacheline_aligned_in_smp; + +/* + * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. + */ +struct cpu_workqueue_struct { + struct global_cwq *gcwq; /* I: the associated gcwq */ + struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* L: max active works */ + struct list_head delayed_works; /* L: delayed works */ +}; + +/* + * Structure used to wait for workqueue flush. + */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + +/* + * All cpumasks are assumed to be always set on UP and thus can't be + * used to determine whether there's something to be done. 
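On UP the mayday mask defined just below degenerates to a single bit in an unsigned long, and for_each_mayday_cpu() becomes an if that runs its body at most once with cpu forced to 0. A standalone toy of that fallback, with a plain assignment standing in for the kernel's atomic clear_bit():

#include <stdio.h>

typedef unsigned long mayday_mask_t;
#define mayday_clear_cpu(cpu, mask)	((void)(cpu), (mask) &= ~1UL)	/* non-atomic stand-in */
#define for_each_mayday_cpu(cpu, mask)	if ((cpu) = 0, (mask))

int main(void)
{
	mayday_mask_t mask = 1UL;	/* as if mayday_test_and_set_cpu() had been called */
	int cpu;

	for_each_mayday_cpu(cpu, mask)	/* body runs once, cpu forced to 0 */
		printf("mayday pending on cpu %d\n", cpu);

	mayday_clear_cpu(5, mask);	/* the cpu argument is irrelevant on UP */
	for_each_mayday_cpu(cpu, mask)
		printf("never reached\n");
	return 0;
}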
+ */ +#ifdef CONFIG_SMP +typedef cpumask_var_t mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) \ + cpumask_test_and_set_cpu((cpu), (mask)) +#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) +#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) +#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) +#define free_mayday_mask(mask) free_cpumask_var((mask)) +#else +typedef unsigned long mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) +#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) +#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) +#define alloc_mayday_mask(maskp, gfp) true +#define free_mayday_mask(mask) do { } while (0) +#endif /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; + unsigned int flags; /* I: WQ_* flags */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct *single; + unsigned long v; + } cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + + mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + + int saved_max_active; /* W: saved cwq max_active */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); + +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +/* + * CPU iterators + * + * An extra gcwq is defined for an invalid cpu number + * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any + * specific CPU. The following iterators are similar to + * for_each_*_cpu() iterators but also considers the unbound gcwq. 
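A small userspace model of __next_gcwq_cpu() above, to make the sw bits concrete: bit 0 walks the cpumask, bit 1 appends the unbound gcwq. With two possible CPUs and sw == 3 the walk yields cpu 0, cpu 1, then the unbound slot, and the NONE sentinel terminates it. The constants and the cpumask stand-in are illustrative, not the kernel's:

#include <stdio.h>

enum { NR_CPU_IDS = 2, CPU_UNBOUND = NR_CPU_IDS, CPU_NONE = NR_CPU_IDS + 1 };

static int next_gcwq_cpu(int cpu, unsigned int sw)
{
	if (cpu < NR_CPU_IDS) {
		if (sw & 1) {			/* walk the "cpumask" first */
			cpu++;			/* stand-in for cpumask_next() */
			if (cpu < NR_CPU_IDS)
				return cpu;
		}
		if (sw & 2)			/* then the unbound gcwq */
			return CPU_UNBOUND;
	}
	return CPU_NONE;
}

int main(void)
{
	int cpu;

	for (cpu = next_gcwq_cpu(-1, 3); cpu < CPU_NONE; cpu = next_gcwq_cpu(cpu, 3))
		printf("cpu %d%s\n", cpu, cpu == CPU_UNBOUND ? " (unbound)" : "");
	return 0;
}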
+ * + * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * WORK_CPU_UNBOUND for unbound workqueues + */ +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -107,7 +364,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; @@ -181,84 +438,582 @@ static inline void debug_work_deactivate(struct work_struct *work) { } /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ -static int singlethread_cpu __read_mostly; -static const struct cpumask *cpu_singlethread_map __read_mostly; /* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. + * The almighty global cpu workqueues. nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. */ -static cpumask_var_t cpu_populated_map __read_mostly; +static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); + +/* + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. + */ +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ + +static int worker_thread(void *__worker); + +static struct global_cwq *get_gcwq(unsigned int cpu) +{ + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; +} + +static atomic_t *get_gcwq_nr_running(unsigned int cpu) +{ + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; +} -/* If it's single threaded, it isn't in the list of workqueues. 
*/ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) { - return wq->singlethread; + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#else + return wq->cpu_wq.single; +#endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; } -static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +static unsigned int work_color_to_flags(int color) { - return is_wq_single_threaded(wq) - ? cpu_singlethread_map : cpu_populated_map; + return color << WORK_STRUCT_COLOR_SHIFT; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static int get_work_color(struct work_struct *work) { - if (unlikely(is_wq_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; } /* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. + * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. 
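
The data encoding described in the comment above can be illustrated outside the kernel. The sketch below is not part of the patch; the flag values, the choice of 8 flag bits and the struct layout are assumptions picked so that a cwq pointer has enough zero low bits to carry flags, which is the property the real code relies on:

#include <stdint.h>
#include <stdio.h>

#define WORK_STRUCT_FLAG_BITS    8
#define WORK_STRUCT_PENDING      (1UL << 0)
#define WORK_STRUCT_CWQ          (1UL << 1)
#define WORK_STRUCT_FLAG_MASK    ((1UL << WORK_STRUCT_FLAG_BITS) - 1)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)

/* stands in for cpu_workqueue_struct; aligned so the low flag bits are free */
struct cwq {
    int cpu;
} __attribute__((aligned(1 << WORK_STRUCT_FLAG_BITS)));

struct work {
    uintptr_t data;
};

static void set_work_cwq(struct work *w, struct cwq *cwq)
{
    /* queued: data carries the cwq pointer plus flag bits */
    w->data = (uintptr_t)cwq | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ;
}

static void set_work_cpu(struct work *w, unsigned int cpu)
{
    /* executing: only the last CPU number is kept, above the flag bits */
    w->data = ((uintptr_t)cpu << WORK_STRUCT_FLAG_BITS) | WORK_STRUCT_PENDING;
}

static struct cwq *get_work_cwq(struct work *w)
{
    return (w->data & WORK_STRUCT_CWQ) ?
        (struct cwq *)(w->data & WORK_STRUCT_WQ_DATA_MASK) : NULL;
}

static int get_work_cpu(struct work *w)
{
    return (w->data & WORK_STRUCT_CWQ) ?
        get_work_cwq(w)->cpu : (int)(w->data >> WORK_STRUCT_FLAG_BITS);
}

int main(void)
{
    static struct cwq cwq = { .cpu = 2 };
    struct work w;

    set_work_cwq(&w, &cwq);
    printf("while queued:  cwq %p, cpu %d\n", (void *)get_work_cwq(&w),
           get_work_cpu(&w));
    set_work_cpu(&w, 2);
    printf("while running: cwq %p, cpu %d\n", (void *)get_work_cwq(&w),
           get_work_cpu(&w));
    return 0;
}
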
*/ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) { - unsigned long new; - BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} + +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +} + +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + unsigned int cpu; + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == WORK_CPU_NONE) + return NULL; - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); + return get_gcwq(cpu); +} + +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; +} + +/* + * Need to wake up a worker? Called from anything but currently + * running workers. + */ +static bool need_more_worker(struct global_cwq *gcwq) +{ + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); } -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) { - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + return gcwq->nr_idle; } +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? 
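
The policy helpers above are small enough to model directly. The following stand-alone sketch is illustrative only: the gcwq is reduced to three counters and the GCWQ_HIGHPRI_PENDING special case is left out. It shows the central concurrency-management rule: wake or create workers only while nothing is running, and keep a running worker going only while it is effectively alone.

#include <stdbool.h>
#include <stdio.h>

struct pool {
    int nr_pending;    /* works sitting on the worklist */
    int nr_running;    /* workers currently executing and not blocked */
    int nr_idle;       /* workers parked on the idle list */
};

static bool need_more_worker(struct pool *p)
{
    return p->nr_pending && !p->nr_running;
}

static bool may_start_working(struct pool *p)
{
    return p->nr_idle > 0;
}

static bool keep_working(struct pool *p)
{
    return p->nr_pending && p->nr_running <= 1;
}

static bool need_to_create_worker(struct pool *p)
{
    return need_more_worker(p) && !may_start_working(p);
}

int main(void)
{
    struct pool p = { .nr_pending = 3, .nr_running = 0, .nr_idle = 0 };

    printf("wake another worker? %d  create one? %d\n",
           need_more_worker(&p), need_to_create_worker(&p));

    p.nr_running = 1;    /* one worker picked up a work */
    printf("wake another worker? %d  keep working? %d\n",
           need_more_worker(&p), keep_working(&p));

    p.nr_running = 2;    /* a previously blocked worker woke back up */
    printf("keep working with two runnable workers? %d\n", keep_working(&p));
    return 0;
}
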
*/ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; + + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; +} + +/* + * Wake up functions. + */ + +/* Return the first worker. Safe with preemption disabled */ +static struct worker *first_worker(struct global_cwq *gcwq) +{ + if (unlikely(list_empty(&gcwq->idle_list))) + return NULL; + + return list_first_entry(&gcwq->idle_list, struct worker, entry); +} + +/** + * wake_up_worker - wake up an idle worker + * @gcwq: gcwq to wake worker for + * + * Wake up the first idle worker of @gcwq. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void wake_up_worker(struct global_cwq *gcwq) +{ + struct worker *worker = first_worker(gcwq); + + if (likely(worker)) + wake_up_process(worker->task); +} + +/** + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. + */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * @wakeup: wakeup an idle worker if necessary + * + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. 
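
As a rough illustration of the trimming policy in too_many_workers() above (not part of the patch; MAX_IDLE_WORKERS_RATIO = 4 is an assumed example value and the manager-counts-as-idle adjustment is left out):

#include <stdbool.h>
#include <stdio.h>

#define MAX_IDLE_WORKERS_RATIO 4    /* assumed example value */

static bool too_many_workers(int nr_workers, int nr_idle)
{
    int nr_busy = nr_workers - nr_idle;

    /* keep two idle workers, then roughly one idle per four busy ones */
    return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

int main(void)
{
    int idle;

    for (idle = 0; idle <= 16; idle++)
        printf("%2d idle / %2d busy -> %s\n", idle, 16 - idle,
               too_many_workers(16, idle) ? "trim" : "ok");
    return 0;
}
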
+ * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags, + bool wakeup) +{ + struct global_cwq *gcwq = worker->gcwq; + + WARN_ON_ONCE(worker->task != current); + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. + */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + + worker->flags |= flags; +} + +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* if transitioning out of NOT_RUNNING, increment nr_running */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); +} + +/** + * busy_worker_head - return the busy hash head for a work + * @gcwq: gcwq of interest + * @work: work to be hashed + * + * Return hash head of @gcwq for @work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to the hash head. + */ +static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, + struct work_struct *work) +{ + const int base_shift = ilog2(sizeof(struct work_struct)); + unsigned long v = (unsigned long)work; + + /* simple shift and fold hash, do we need something better? */ + v >>= base_shift; + v += v >> BUSY_WORKER_HASH_ORDER; + v &= BUSY_WORKER_HASH_MASK; + + return &gcwq->busy_hash[v]; +} + +/** + * __find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @bwh: hash head as returned by busy_worker_head() + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. @bwh should be + * the hash head obtained by calling busy_worker_head() with the same + * work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, + struct hlist_head *bwh, + struct work_struct *work) +{ + struct worker *worker; + struct hlist_node *tmp; + + hlist_for_each_entry(worker, tmp, bwh, hentry) + if (worker->current_work == work) + return worker; + return NULL; +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. This function is + * identical to __find_worker_executing_work() except that this + * function calculates @bwh itself. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. 
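
The shift-and-fold hash in busy_worker_head() above can be tried in isolation. The sketch below is illustrative only; BUSY_WORKER_HASH_ORDER = 6 and a 32-byte struct work_struct are assumed example values.

#include <stdio.h>

#define BUSY_WORKER_HASH_ORDER 6
#define BUSY_WORKER_HASH_SIZE  (1 << BUSY_WORKER_HASH_ORDER)
#define BUSY_WORKER_HASH_MASK  (BUSY_WORKER_HASH_SIZE - 1)
#define WORK_STRUCT_SIZE       32   /* assumed sizeof(struct work_struct) */
#define BASE_SHIFT             5    /* ilog2(WORK_STRUCT_SIZE) */

static unsigned int busy_worker_hash(unsigned long work_addr)
{
    unsigned long v = work_addr;

    v >>= BASE_SHIFT;                  /* drop the always-zero low bits */
    v += v >> BUSY_WORKER_HASH_ORDER;  /* fold higher bits back in */
    return v & BUSY_WORKER_HASH_MASK;  /* pick one of 64 buckets */
}

int main(void)
{
    unsigned long base = 0x12345600UL;  /* made-up work addresses */
    int i;

    for (i = 0; i < 8; i++)
        printf("work at %#lx -> bucket %u\n",
               base + i * WORK_STRUCT_SIZE,
               busy_worker_hash(base + i * WORK_STRUCT_SIZE));
    return 0;
}
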
+ */ +static struct worker *find_worker_executing_work(struct global_cwq *gcwq, + struct work_struct *work) +{ + return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), + work); +} + +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to inserstion position. + */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + +/** + * insert_work - insert a work into gcwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work which belongs to @cwq into @gcwq after @head. + * @extra_flags is or'd to work_struct flags. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { - trace_workqueue_insertion(cwq->thread, work); + struct global_cwq *gcwq = cwq->gcwq; + + /* we own @work, set data and link */ + set_work_cwq(work, cwq, extra_flags); - set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); + list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); + + /* + * Ensure either worker_sched_deactivated() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. + */ + smp_mb(); + + if (__need_more_worker(gcwq)) + wake_up_worker(gcwq); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; + unsigned int work_flags; unsigned long flags; debug_work_activate(work); - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); + + if (WARN_ON_ONCE(wq->flags & WQ_DYING)) + return; + + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *last_gcwq; + + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. 
+ */ + gcwq = get_gcwq(cpu); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); + + if (likely(cwq->nr_active < cwq->max_active)) { + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else { + work_flags |= WORK_STRUCT_DELAYED; + worklist = &cwq->delayed_works; + } + + insert_work(cwq, work, worklist, work_flags); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -298,9 +1053,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -310,10 +1064,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -350,14 +1103,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + unsigned int lcpu; + BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -372,80 +1142,888 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -static void run_workqueue(struct cpu_workqueue_struct *cwq) +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). 
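
The max_active gating in __queue_work() above, together with cwq_activate_first_delayed() further down in the patch, amounts to a simple counting gate per cwq. A minimal illustrative model (names and the max_active value are assumptions, not patch code):

#include <stdio.h>

#define MAX_ACTIVE 2    /* assumed example value */

static int nr_active;   /* works currently on the shared worklist */
static int nr_delayed;  /* works parked on the cwq's delayed list */

static void queue_work(int id)
{
    if (nr_active < MAX_ACTIVE) {
        nr_active++;
        printf("work %d -> worklist (active=%d)\n", id, nr_active);
    } else {
        nr_delayed++;
        printf("work %d -> delayed list (delayed=%d)\n", id, nr_delayed);
    }
}

static void work_finished(void)
{
    nr_active--;
    if (nr_delayed) {   /* the role of cwq_activate_first_delayed() */
        nr_delayed--;
        nr_active++;
        printf("activated one delayed work (active=%d)\n", nr_active);
    }
}

int main(void)
{
    int i;

    for (i = 1; i <= 4; i++)
        queue_work(i);      /* works 3 and 4 get delayed */
    for (i = 1; i <= 4; i++)
        work_finished();
    return 0;
}
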
+ */ +static void worker_enter_idle(struct worker *worker) { - spin_lock_irq(&cwq->lock); - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(worker->flags & WORKER_IDLE); + BUG_ON(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev)); + + /* can't use worker_set_flags(), also called from start_worker() */ + worker->flags |= WORKER_IDLE; + gcwq->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &gcwq->idle_list); + + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); + + /* sanity check nr_running */ + WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + atomic_read(get_gcwq_nr_running(gcwq->cpu))); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(!(worker->flags & WORKER_IDLE)); + worker_clr_flags(worker, WORKER_IDLE); + gcwq->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +__acquires(&gcwq->lock) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. 
*/ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - trace_workqueue_execution(cwq->thread, work); - debug_work_deactivate(work); - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(¤t->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* CPU has come up inbetween, retry migration */ + cpu_relax(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + +static struct worker *alloc_worker(void) +{ + struct worker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) { + INIT_LIST_HEAD(&worker->entry); + INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; + } + return worker; +} + +/** + * create_worker - create a new workqueue worker + * @gcwq: gcwq the new worker will belong to + * @bind: whether to set affinity to @cpu or not + * + * Create a new worker which is bound to @gcwq. The returned worker + * can be started by calling start_worker() or destroyed using + * destroy_worker(). + * + * CONTEXT: + * Might sleep. Does GFP_KERNEL allocations. + * + * RETURNS: + * Pointer to the newly created worker. + */ +static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +{ + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker *worker = NULL; + int id = -1; + + spin_lock_irq(&gcwq->lock); + while (ida_get_new(&gcwq->worker_ida, &id)) { + spin_unlock_irq(&gcwq->lock); + if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + goto fail; + spin_lock_irq(&gcwq->lock); + } + spin_unlock_irq(&gcwq->lock); + + worker = alloc_worker(); + if (!worker) + goto fail; + + worker->gcwq = gcwq; + worker->id = id; + + if (!on_unbound_cpu) + worker->task = kthread_create(worker_thread, worker, + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); + if (IS_ERR(worker->task)) + goto fail; + + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. 
+ */ + if (bind && !on_unbound_cpu) + kthread_bind(worker->task, gcwq->cpu); + else { + worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } + + return worker; +fail: + if (id >= 0) { + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); + spin_unlock_irq(&gcwq->lock); + } + kfree(worker); + return NULL; +} + +/** + * start_worker - start a newly created worker + * @worker: worker to start + * + * Make the gcwq aware of @worker and start it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void start_worker(struct worker *worker) +{ + worker->flags |= WORKER_STARTED; + worker->gcwq->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); +} + +/** + * destroy_worker - destroy a workqueue worker + * @worker: worker to be destroyed + * + * Destroy @worker and adjust @gcwq stats accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void destroy_worker(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + int id = worker->id; + + /* sanity check frenzy */ + BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); + + if (worker->flags & WORKER_STARTED) + gcwq->nr_workers--; + if (worker->flags & WORKER_IDLE) + gcwq->nr_idle--; + + list_del_init(&worker->entry); + worker->flags |= WORKER_DIE; + + spin_unlock_irq(&gcwq->lock); + + kthread_stop(worker->task); + kfree(worker); + + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); +} + +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); } + } - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); } - spin_unlock_irq(&cwq->lock); + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); } -static int worker_thread(void *__cwq) +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. 
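
The idle timer logic above leans on the idle list being kept LIFO: the worker at its tail has been idle the longest, so one comparison against that worker's last_active decides whether the pool should be trimmed. An illustrative user-space model, with plain counters standing in for jiffies and an assumed timeout value:

#include <stdbool.h>
#include <stdio.h>

#define IDLE_WORKER_TIMEOUT 300    /* assumed example value, in "ticks" */

static bool time_before(unsigned long a, unsigned long b)
{
    return (long)(a - b) < 0;
}

/* oldest_last_active: last_active of the worker at the idle list's tail */
static void idle_timer(unsigned long now, unsigned long oldest_last_active)
{
    unsigned long expires = oldest_last_active + IDLE_WORKER_TIMEOUT;

    if (time_before(now, expires))
        printf("t=%lu: within timeout, re-arm the timer for t=%lu\n",
               now, expires);
    else
        printf("t=%lu: idle too long, flag the manager to trim workers\n",
               now);
}

int main(void)
{
    idle_timer(100, 50);    /* oldest idle worker went idle at t=50 */
    idle_timer(400, 50);
    return 0;
}
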
@gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_create_worker(struct global_cwq *gcwq) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); + if (!need_to_create_worker(gcwq)) + return false; +restart: + spin_unlock_irq(&gcwq->lock); + + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; + } + + if (!need_to_create_worker(gcwq)) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); - if (cwq->wq->freezeable) - set_freezable(); + if (!need_to_create_worker(gcwq)) + break; + } + + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; +} + +/** + * maybe_destroy_worker - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) +{ + bool ret = false; - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; - try_to_freeze(); + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; - if (kthread_should_stop()) + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); break; + } - run_workqueue(cwq); + destroy_worker(worker); + ret = true; } - return 0; + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to. At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. 
+ */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + + move_linked_works(work, pos, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + cwq->nr_active++; +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * @delayed: for a delayed work + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, + bool delayed) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + if (!delayed) { + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + } + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. 
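
move_linked_works() above moves a work together with every consecutive follower as long as the just-moved work carries WORK_STRUCT_LINKED, which is how a flush barrier stays glued to the work it was queued behind. A minimal illustrative model using an array instead of a list (names assumed):

#include <stdbool.h>
#include <stdio.h>

struct work {
    const char *name;
    bool linked;    /* "the next entry belongs to my chain" */
};

/* move works[from..] until an entry without LINKED has been moved */
static int move_linked_works(struct work *works, int n, int from)
{
    int i;

    for (i = from; i < n; i++) {
        printf("moving %s\n", works[i].name);
        if (!works[i].linked)
            break;
    }
    return i < n ? i + 1 : n;   /* first entry that was not moved */
}

int main(void)
{
    struct work worklist[] = {
        { "work A",      true  },   /* a barrier is linked behind it */
        { "A's barrier", false },
        { "work B",      false },
    };
    int next = move_linked_works(worklist, 3, 0);

    printf("next unmoved entry: %d (%s)\n", next, worklist[next].name);
    return 0;
}
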
+ */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * process_one_work - process single work + * @worker: self + * @work: work to process + * + * Process @work. This function contains all the logics necessary to + * process a single work including synchronization against and + * interaction with other workers on the same cpu, queueing and + * flushing. As long as context requirement is met, any worker can + * call this function to process a work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void process_one_work(struct worker *worker, struct work_struct *work) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct global_cwq *gcwq = cwq->gcwq; + struct hlist_head *bwh = busy_worker_head(gcwq, work); + bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; + work_func_t f = work->func; + int work_color; + struct worker *collision; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct from + * inside the function that is called from it, this we need to + * take into account for lockdep too. To avoid bogus "held + * lock freed" warnings as well as problems when looking into + * work->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + /* + * A single work shouldn't be executed concurrently by + * multiple workers on a single cpu. Check whether anyone is + * already processing the work. If so, defer the work to the + * currently executing one. + */ + collision = __find_worker_executing_work(gcwq, bwh, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, NULL); + return; + } + + /* claim and process */ + debug_work_deactivate(work); + hlist_add_head(&worker->hentry, bwh); + worker->current_work = work; + worker->current_cwq = cwq; + work_color = get_work_color(work); + + /* record the current cpu number in the work data and dequeue */ + set_work_cpu(work, gcwq->cpu); + list_del_init(&work->entry); + + /* + * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, + * wake up another worker; otherwise, clear HIGHPRI_PENDING. + */ + if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + struct work_struct *nwork = list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (!list_empty(&gcwq->worklist) && + get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) + wake_up_worker(gcwq); + else + gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + } + + /* + * CPU intensive works don't participate in concurrency + * management. They're the scheduler's responsibility. + */ + if (unlikely(cpu_intensive)) + worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + + spin_unlock_irq(&gcwq->lock); + + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + trace_workqueue_execute_start(work); + f(work); + /* + * While we must be careful to not use "work" after this, the trace + * point will only record its address. 
+ */ + trace_workqueue_execute_end(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), task_pid_nr(current)); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + + spin_lock_irq(&gcwq->lock); + + /* clear cpu intensive status */ + if (unlikely(cpu_intensive)) + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + + /* we're done with it, release */ + hlist_del_init(&worker->hentry); + worker->current_work = NULL; + worker->current_cwq = NULL; + cwq_dec_nr_in_flight(cwq, work_color, false); +} + +/** + * process_scheduled_works - process scheduled works + * @worker: self + * + * Process all scheduled works. Please note that the scheduled list + * may change while processing a work, so this function repeatedly + * fetches a work from the top and executes it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. + */ +static void process_scheduled_works(struct worker *worker) +{ + while (!list_empty(&worker->scheduled)) { + struct work_struct *work = list_first_entry(&worker->scheduled, + struct work_struct, entry); + process_one_work(worker, work); + } +} + +/** + * worker_thread - the worker thread function + * @__worker: self + * + * The gcwq worker thread function. There's a single dynamic pool of + * these per each cpu. These workers process all works regardless of + * their specific target workqueue. The only exception is works which + * belong to workqueues with a rescuer which will be explained in + * rescuer_thread(). + */ +static int worker_thread(void *__worker) +{ + struct worker *worker = __worker; + struct global_cwq *gcwq = worker->gcwq; + + /* tell the scheduler that this is a workqueue worker */ + worker->task->flags |= PF_WQ_WORKER; +woke_up: + spin_lock_irq(&gcwq->lock); + + /* DIE can be set only while we're idle, checking here is enough */ + if (worker->flags & WORKER_DIE) { + spin_unlock_irq(&gcwq->lock); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + worker_leave_idle(worker); +recheck: + /* no more worker necessary? */ + if (!need_more_worker(gcwq)) + goto sleep; + + /* do we need to manage? */ + if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * ->scheduled list can only be filled while a worker is + * preparing to process a work or actually processing it. + * Make sure nobody diddled with it while I was sleeping. + */ + BUG_ON(!list_empty(&worker->scheduled)); + + /* + * When control reaches this point, we're guaranteed to have + * at least one idle worker or that someone else has already + * assumed the manager role. 
+ */ + worker_clr_flags(worker, WORKER_PREP); + + do { + struct work_struct *work = + list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { + /* optimization path, not strictly necessary */ + process_one_work(worker, work); + if (unlikely(!list_empty(&worker->scheduled))) + process_scheduled_works(worker); + } else { + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); + } + } while (keep_working(gcwq)); + + worker_set_flags(worker, WORKER_PREP, false); +sleep: + if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * gcwq->lock is held and there's no work to process and no + * need to manage, sleep. Workers are woken up only while + * holding gcwq->lock or from local cpu, so setting the + * current state before releasing gcwq->lock is enough to + * prevent losing any event. + */ + worker_enter_idle(worker); + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&gcwq->lock); + schedule(); + goto woke_up; +} + +/** + * rescuer_thread - the rescuer thread function + * @__wq: the associated workqueue + * + * Workqueue rescuer thread function. There's one rescuer for each + * workqueue which has WQ_RESCUER set. + * + * Regular work processing on a gcwq may block trying to create a new + * worker which uses GFP_KERNEL allocation which has slight chance of + * developing into deadlock if some works currently on the same queue + * need to be processed to satisfy the GFP_KERNEL allocation. This is + * the problem rescuer solves. + * + * When such condition is possible, the gcwq summons rescuers of all + * workqueues which have works queued on the gcwq and let them process + * those works so that forward progress can be guaranteed. + * + * This should happen rarely. + */ +static int rescuer_thread(void *__wq) +{ + struct workqueue_struct *wq = __wq; + struct worker *rescuer = wq->rescuer; + struct list_head *scheduled = &rescuer->scheduled; + bool is_unbound = wq->flags & WQ_UNBOUND; + unsigned int cpu; + + set_user_nice(current, RESCUER_NICE_LEVEL); +repeat: + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + /* + * See whether any cpu is asking for help. Unbounded + * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. + */ + for_each_mayday_cpu(cpu, wq->mayday_mask) { + unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; + struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + struct work_struct *work, *n; + + __set_current_state(TASK_RUNNING); + mayday_clear_cpu(cpu, wq->mayday_mask); + + /* migrate to the target cpu if possible */ + rescuer->gcwq = gcwq; + worker_maybe_bind_and_lock(rescuer); + + /* + * Slurp in all works issued via this workqueue and + * process'em. 
+ */ + BUG_ON(!list_empty(&rescuer->scheduled)); + list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + if (get_work_cwq(work) == cwq) + move_linked_works(work, scheduled, &n); + + process_scheduled_works(rescuer); + spin_unlock_irq(&gcwq->lock); + } + + schedule(); + goto repeat; } struct wq_barrier { @@ -459,44 +2037,137 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @target: target work to attach @barr to + * @worker: worker currently executing @target, NULL if @target is not executing + * + * @barr is linked to @target such that @barr is completed only after + * @target finishes execution. Please note that the ordering + * guarantee is observed only with respect to @target and on the local + * cpu. + * + * Currently, a queued barrier can't be canceled. This is because + * try_to_grab_pending() can't determine whether the work to be + * grabbed is at the head of the queue and thus can't clear LINKED + * flag of the previous work while there must be a valid next work + * after a work with LINKED flag set. + * + * Note that when @worker is non-NULL, @target may be modified + * underneath us, so we can't reliably determine cwq from @target. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) + struct wq_barrier *barr, + struct work_struct *target, struct worker *worker) { + struct list_head *head; + unsigned int linked = 0; + /* - * debugobject calls are safe here even with cwq->lock locked + * debugobject calls are safe here even with gcwq->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + /* + * If @target is currently being executed, schedule the + * barrier to the worker; otherwise, put it after @target. + */ + if (worker) + head = worker->scheduled.next; + else { + unsigned long *bits = work_data_bits(target); + + head = target->entry.next; + /* there can already be other linked works, inherit and set */ + linked = *bits & WORK_STRUCT_LINKED; + __set_bit(WORK_STRUCT_LINKED_BIT, bits); + } + debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head); + insert_work(cwq, &barr->work, head, + work_color_to_flags(WORK_NO_COLOR) | linked); } -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +/** + * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * @wq: workqueue being flushed + * @flush_color: new flush color, < 0 for no-op + * @work_color: new work color, < 0 for no-op + * + * Prepare cwqs for workqueue flushing. + * + * If @flush_color is non-negative, flush_color on all cwqs should be + * -1. If no cwq has in-flight commands at the specified color, all + * cwq->flush_color's stay at -1 and %false is returned. If any cwq + * has in flight commands, its cwq->flush_color is set to + * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * wakeup logic is armed and %true is returned. + * + * The caller should have initialized @wq->first_flusher prior to + * calling this function with non-negative @flush_color. 
If + * @flush_color is negative, no flush color update is done and %false + * is returned. + * + * If @work_color is non-negative, all cwqs should have the same + * work_color which is previous to @work_color and all will be + * advanced to @work_color. + * + * CONTEXT: + * mutex_lock(wq->flush_mutex). + * + * RETURNS: + * %true if @flush_color >= 0 and there's something to flush. %false + * otherwise. + */ +static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, + int flush_color, int work_color) { - int active = 0; - struct wq_barrier barr; - - WARN_ON(cwq->thread == current); + bool wait = false; + unsigned int cpu; - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; + if (flush_color >= 0) { + BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); + atomic_set(&wq->nr_cwqs_to_flush, 1); } - spin_unlock_irq(&cwq->lock); - if (active) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + + spin_lock_irq(&gcwq->lock); + + if (flush_color >= 0) { + BUG_ON(cwq->flush_color != -1); + + if (cwq->nr_in_flight[flush_color]) { + cwq->flush_color = flush_color; + atomic_inc(&wq->nr_cwqs_to_flush); + wait = true; + } + } + + if (work_color >= 0) { + BUG_ON(work_color != work_next_color(cwq->work_color)); + cwq->work_color = work_color; + } + + spin_unlock_irq(&gcwq->lock); } - return active; + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + complete(&wq->first_flusher->done); + + return wait; } /** @@ -508,20 +2179,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. */ void flush_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + struct wq_flusher this_flusher = { + .list = LIST_HEAD_INIT(this_flusher.list), + .flush_color = -1, + .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + }; + int next_color; - might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu(cpu, cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); + + mutex_lock(&wq->flush_mutex); + + /* + * Start-to-wait phase + */ + next_color = work_next_color(wq->work_color); + + if (next_color != wq->flush_color) { + /* + * Color space is not full. The current work_color + * becomes our flush_color and work_color is advanced + * by one. + */ + BUG_ON(!list_empty(&wq->flusher_overflow)); + this_flusher.flush_color = wq->work_color; + wq->work_color = next_color; + + if (!wq->first_flusher) { + /* no flush in progress, become the first flusher */ + BUG_ON(wq->flush_color != this_flusher.flush_color); + + wq->first_flusher = &this_flusher; + + if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + wq->work_color)) { + /* nothing to flush, done */ + wq->flush_color = next_color; + wq->first_flusher = NULL; + goto out_unlock; + } + } else { + /* wait in queue */ + BUG_ON(wq->flush_color == this_flusher.flush_color); + list_add_tail(&this_flusher.list, &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + } else { + /* + * Oops, color space is full, wait on overflow queue. 
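
The start-to-wait phase above treats flush colors as a small ring: each flusher takes the current work color and advances it, and once advancing would land back on the color still being flushed, the color space is full and later flushers wait on the overflow queue. An illustrative model in which no flush ever completes (WORK_NR_COLORS = 15 is an assumed example value, one color being reserved for WORK_NO_COLOR):

#include <stdio.h>

#define WORK_NR_COLORS 15    /* assumed example value */

static int work_color;       /* color given to newly queued works */
static int flush_color = -1; /* color currently being flushed, -1: none */

static int work_next_color(int color)
{
    return (color + 1) % WORK_NR_COLORS;
}

static void start_flush(int n)
{
    int next_color = work_next_color(work_color);

    if (next_color != flush_color) {
        if (flush_color < 0)
            flush_color = work_color;   /* become the first flusher */
        printf("flusher %2d: flushes color %2d, work color -> %2d\n",
               n, work_color, next_color);
        work_color = next_color;
    } else {
        printf("flusher %2d: color space full, waits on overflow queue\n", n);
    }
}

int main(void)
{
    int i;

    /* nothing ever completes here, so the ring eventually fills up */
    for (i = 1; i <= WORK_NR_COLORS + 1; i++)
        start_flush(i);
    return 0;
}
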
+ * The next flush completion will assign us + * flush_color and transfer to flusher_queue. + */ + list_add_tail(&this_flusher.list, &wq->flusher_overflow); + } + + mutex_unlock(&wq->flush_mutex); + + wait_for_completion(&this_flusher.done); + + /* + * Wake-up-and-cascade phase + * + * First flushers are responsible for cascading flushes and + * handling overflow. Non-first flushers can simply return. + */ + if (wq->first_flusher != &this_flusher) + return; + + mutex_lock(&wq->flush_mutex); + + /* we might have raced, check again with mutex held */ + if (wq->first_flusher != &this_flusher) + goto out_unlock; + + wq->first_flusher = NULL; + + BUG_ON(!list_empty(&this_flusher.list)); + BUG_ON(wq->flush_color != this_flusher.flush_color); + + while (true) { + struct wq_flusher *next, *tmp; + + /* complete all the flushers sharing the current flush color */ + list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { + if (next->flush_color != wq->flush_color) + break; + list_del_init(&next->list); + complete(&next->done); + } + + BUG_ON(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); + + /* this flush_color is finished, advance by one */ + wq->flush_color = work_next_color(wq->flush_color); + + /* one color has been freed, handle overflow queue */ + if (!list_empty(&wq->flusher_overflow)) { + /* + * Assign the same color to all overflowed + * flushers, advance work_color and append to + * flusher_queue. This is the start-to-wait + * phase for these overflowed flushers. + */ + list_for_each_entry(tmp, &wq->flusher_overflow, list) + tmp->flush_color = wq->work_color; + + wq->work_color = work_next_color(wq->work_color); + + list_splice_tail_init(&wq->flusher_overflow, + &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + + if (list_empty(&wq->flusher_queue)) { + BUG_ON(wq->flush_color != wq->work_color); + break; + } + + /* + * Need to flush more colors. Make the next flusher + * the new first flusher and arm cwqs. + */ + BUG_ON(wq->flush_color == wq->work_color); + BUG_ON(wq->flush_color != next->flush_color); + + list_del_init(&next->list); + wq->first_flusher = next; + + if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + break; + + /* + * Meh... this color is already done, clear first + * flusher and repeat cascading. + */ + wq->first_flusher = NULL; + } + +out_unlock: + mutex_unlock(&wq->flush_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -537,43 +2338,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); */ int flush_work(struct work_struct *work) { + struct worker *worker = NULL; + struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct list_head *prev; struct wq_barrier barr; might_sleep(); - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return 0; - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. + * If it was re-queued to a different gcwq under us, we + * are not going to wait. 
*/ smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; + cwq = get_work_cwq(work); + if (unlikely(!cwq || gcwq != cwq->gcwq)) + goto already_gone; } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; + worker = find_worker_executing_work(gcwq, work); + if (!worker) + goto already_gone; + cwq = worker->current_cwq; } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + + insert_wq_barrier(cwq, &barr, work, worker); + spin_unlock_irq(&gcwq->lock); + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&gcwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -583,54 +2387,56 @@ EXPORT_SYMBOL_GPL(flush_work); */ static int try_to_grab_pending(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; + struct global_cwq *gcwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return ret; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong cwq. + * This work is queued, but perhaps we locked the wrong gcwq. * In that case we must see the new value after rmb(), see * insert_work()->wmb(). */ smp_rmb(); - if (cwq == get_wq_data(work)) { + if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work), + *work_data_bits(work) & WORK_STRUCT_DELAYED); ret = 1; } } - spin_unlock_irq(&cwq->lock); + spin_unlock_irq(&gcwq->lock); return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) { struct wq_barrier barr; - int running = 0; + struct worker *worker; - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); + + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); - if (unlikely(running)) { + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); } @@ -638,9 +2444,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, static void wait_on_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -648,15 +2451,8 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + for_each_gcwq_cpu(cpu) + wait_on_cpu_work(get_gcwq(cpu), work); } static int __cancel_work_timer(struct work_struct *work, @@ -671,7 +2467,7 @@ static int 
__cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - work_clear_pending(work); + clear_work_data(work); return ret; } @@ -717,8 +2513,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -static struct workqueue_struct *keventd_wq __read_mostly; - /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -732,7 +2526,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; */ int schedule_work(struct work_struct *work) { - return queue_work(keventd_wq, work); + return queue_work(system_wq, work); } EXPORT_SYMBOL(schedule_work); @@ -745,7 +2539,7 @@ EXPORT_SYMBOL(schedule_work); */ int schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, keventd_wq, work); + return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); @@ -760,7 +2554,7 @@ EXPORT_SYMBOL(schedule_work_on); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, dwork, delay); + return queue_delayed_work(system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -773,9 +2567,8 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, + &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -794,7 +2587,7 @@ EXPORT_SYMBOL(flush_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -810,8 +2603,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; - int orig = -1; - struct work_struct *works; + struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) @@ -819,23 +2611,12 @@ int schedule_on_each_cpu(work_func_t func) get_online_cpus(); - /* - * When running in keventd don't schedule a work item on - * itself. Can just call directly because the work queue is - * already bound. This also is faster. - */ - if (current_is_keventd()) - orig = raw_smp_processor_id(); - for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - if (cpu != orig) - schedule_work_on(cpu, work); + schedule_work_on(cpu, work); } - if (orig >= 0) - func(per_cpu_ptr(works, orig)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); @@ -845,9 +2626,33 @@ int schedule_on_each_cpu(work_func_t func) return 0; } +/** + * flush_scheduled_work - ensure that any scheduled work has run to completion. + * + * Forces execution of the kernel-global workqueue and blocks until its + * completion. + * + * Think twice before calling this function! It's very easy to get into + * trouble if you don't take great care. Either of the following situations + * will lead to deadlock: + * + * One of the work items currently on the workqueue needs to acquire + * a lock held by your code or its caller. + * + * Your code is running in the context of a work routine. + * + * They will be detected by lockdep when they occur, but the first might not + * occur very often. 
It depends on what work items are on the workqueue and + * what locks they need, which you have no control over. + * + * In most situations flushing the entire workqueue is overkill; you merely + * need to know that a particular work item isn't queued and isn't running. + * In such cases you should use cancel_delayed_work_sync() or + * cancel_work_sync() instead. + */ void flush_scheduled_work(void) { - flush_workqueue(keventd_wq); + flush_workqueue(system_wq); } EXPORT_SYMBOL(flush_scheduled_work); @@ -879,170 +2684,169 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); int keventd_up(void) { - return keventd_wq != NULL; + return system_wq != NULL; } -int current_is_keventd(void) +static int alloc_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; + if (percpu) + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + else { + void *ptr; - return ret; + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } + } + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 0 : -ENOMEM; } -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +static void free_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - return cwq; + if (percpu) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ + kfree(*(void **)(wq->cpu_wq.single + 1)); + } } -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; + int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. 
- */ - if (IS_ERR(p)) - return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - cwq->thread = p; + if (max_active < 1 || max_active > lim) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); - trace_workqueue_creation(cwq->thread, cpu); - - return 0; + return clamp_val(max_active, 1, lim); } -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) { - struct task_struct *p = cwq->thread; + struct workqueue_struct *wq; + unsigned int cpu; - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + wq->flags = flags; + wq->saved_max_active = max_active; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. 
- */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); + if (alloc_cwqs(wq) < 0) + goto err; + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = get_gcwq(cpu); + + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); + cwq->gcwq = gcwq; + cwq->wq = wq; + cwq->flush_color = -1; + cwq->max_active = max_active; + INIT_LIST_HEAD(&cwq->delayed_works); } - if (err) { - destroy_workqueue(wq); - wq = NULL; + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. */ - if (cwq->thread == NULL) - return; + spin_lock(&workqueue_lock); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + for_each_cwq_cpu(cpu, wq) + get_cwq(cpu, wq)->max_active = 0; - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - trace_workqueue_destruction(cwq->thread); - kthread_stop(cwq->thread); - cwq->thread = NULL; + list_add(&wq->list, &workqueues); + + spin_unlock(&workqueue_lock); + + return wq; +err: + if (wq) { + free_cwqs(wq); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + kfree(wq); + } + return NULL; } +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue @@ -1052,71 +2856,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + unsigned int cpu; - cpu_maps_update_begin(); + wq->flags |= WQ_DYING; + flush_workqueue(wq); + + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. 
+ */ spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu(cpu, cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); + /* sanity check */ + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); + } + + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + } - free_percpu(wq->cpu_wq); + free_cwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_cwq_cpu(cpu, wq) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZEABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. + */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : WORK_CPU_NONE; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return false; + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. 
Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve mix of short, long and very long running works making + * blocked draining impractical. + * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has went down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? 
-1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezeable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } + } + + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. 
+ * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. + */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int ret = NOTIFY_OK; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ case CPU_UP_PREPARE: - cpumask_set_cpu(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - if (!create_workqueue_thread(cwq, cpu)) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - ret = NOTIFY_BAD; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; } } + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + switch (action) { - case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + 
gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + break; + case CPU_POST_DEAD: - cpumask_clear_cpu(cpu, cpu_populated_map); + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; } - return ret; + spin_unlock_irqrestore(&gcwq->lock, flags); + + return notifier_from_errno(0); } #ifdef CONFIG_SMP @@ -1166,14 +3419,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) EXPORT_SYMBOL_GPL(work_on_cpu); #endif /* CONFIG_SMP */ -void __init init_workqueues(void) +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all + * freezeable workqueues will queue new works to their frozen_works + * list instead of gcwq->worklist. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void freeze_workqueues_begin(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (cwq && wq->flags & WQ_FREEZEABLE) + cwq->max_active = 0; + } + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezeable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezeable workqueues are still busy. %false if + * freezing is complete. + */ +bool freeze_workqueues_busy(void) +{ + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_gcwq_cpu(cpu) { + struct workqueue_struct *wq; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. 
+ */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective gcwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void thaw_workqueues(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + wake_up_worker(gcwq); + + spin_unlock_irq(&gcwq->lock); + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + +static int __init init_workqueues(void) { - alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + unsigned int cpu; + int i; + + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_init(&gcwq->lock); + INIT_LIST_HEAD(&gcwq->worklist); + gcwq->cpu = cpu; + gcwq->flags |= GCWQ_DISASSOCIATED; + + INIT_LIST_HEAD(&gcwq->idle_list); + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) + INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; - cpumask_copy(cpu_populated_map, cpu_online_mask); - singlethread_cpu = cpumask_first(cpu_possible_mask); - cpu_singlethread_map = cpumask_of(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + + ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); + } + + /* create the initial worker */ + for_each_online_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + if (cpu != WORK_CPU_UNBOUND) + gcwq->flags &= ~GCWQ_DISASSOCIATED; + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + return 0; } +early_initcall(init_workqueues); diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 00000000000..2d10fc98dc7 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,9 @@ +/* 
+ * kernel/workqueue_sched.h
+ *
+ * Scheduler hooks for concurrency managed workqueue. Only to be
+ * included from sched.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+				       unsigned int cpu);
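
Caller-side view of the interface change above: keventd_wq disappears in favour of system_wq, and __create_workqueue_key() is replaced by __alloc_workqueue_key() behind alloc_workqueue(name, flags, max_active). The sketch below is hypothetical (the "demo" queue, demo_fn() and demo_work are invented for illustration) and only exercises calls that appear in this patch: alloc_workqueue(), queue_work(), cancel_work_sync() and destroy_workqueue(), which now sets WQ_DYING and flushes before tearing the queue down. A max_active of 0 falls back to WQ_DFL_ACTIVE, as in __alloc_workqueue_key() above.

/*
 * Hypothetical module illustrating the post-rework calling convention.
 * The names demo_wq, demo_fn and demo_work are invented; the API calls
 * are the ones added or kept by this patch.
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work executed\n");
}

static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* flags = 0: bound, not freezable, no rescuer; max_active 0 => WQ_DFL_ACTIVE */
	demo_wq = alloc_workqueue("demo", 0, 0);
	if (!demo_wq)
		return -ENOMEM;

	queue_work(demo_wq, &demo_work);	/* concurrency-managed per-cpu queueing */
	return 0;
}

static void __exit demo_exit(void)
{
	/*
	 * Per the new flush_scheduled_work() kernel-doc, cancel the
	 * specific item rather than flushing the global workqueue.
	 */
	cancel_work_sync(&demo_work);
	destroy_workqueue(demo_wq);	/* sets WQ_DYING and flushes first */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Note that schedule_work() and friends now simply queue onto system_wq, so code that previously relied on keventd_wq keeps working unchanged.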

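The patch also exports three advisory helpers documented above: workqueue_set_max_active(), workqueue_congested() and work_busy(). Below is a hedged sketch of how a caller might consult them for soft back-pressure; demo_wq and demo_work carry over from the previous (invented) example, the halving policy is purely illustrative, and the corresponding declarations are assumed to live in include/linux/workqueue.h (that header change is not part of this kernel/ diff). As the kernel-doc stresses, workqueue_congested() and work_busy() take no locks against the caller, so the answers are hints, not guarantees.

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

/*
 * Sketch only: the throttling policy is invented; the helpers are the
 * ones introduced by this patch (workqueue_congested(), work_busy(),
 * work_cpu(), workqueue_set_max_active()).
 */
static void demo_throttle(struct workqueue_struct *demo_wq,
			  struct work_struct *demo_work)
{
	unsigned int cpu = get_cpu();	/* sample this cpu's cwq */
	bool backlog = workqueue_congested(cpu, demo_wq);

	put_cpu();

	if (backlog)
		/* clamped to a valid range by wq_clamp_max_active() */
		workqueue_set_max_active(demo_wq, WQ_DFL_ACTIVE / 2);

	/* unsynchronized snapshot: pending and/or currently running? */
	if (work_busy(demo_work) & (WORK_BUSY_PENDING | WORK_BUSY_RUNNING))
		pr_debug("demo work still in flight, last cpu %u\n",
			 work_cpu(demo_work));
}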