Diffstat (limited to 'arch/arm64/kernel'): 46 files changed, 4607 insertions, 1054 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 7b4b564961d..cdaedad3afe 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -4,20 +4,31 @@  CPPFLAGS_vmlinux.lds	:= -DTEXT_OFFSET=$(TEXT_OFFSET)  AFLAGS_head.o		:= -DTEXT_OFFSET=$(TEXT_OFFSET) +CFLAGS_efi-stub.o 	:= -DTEXT_OFFSET=$(TEXT_OFFSET) \ +			   -I$(src)/../../../scripts/dtc/libfdt + +CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_insn.o = -pg +CFLAGS_REMOVE_return_address.o = -pg  # Object file lists.  arm64-obj-y		:= cputable.o debug-monitors.o entry.o irq.o fpsimd.o	\  			   entry-fpsimd.o process.o ptrace.o setup.o signal.o	\  			   sys.o stacktrace.o time.o traps.o io.o vdso.o	\ -			   hyp-stub.o psci.o +			   hyp-stub.o psci.o cpu_ops.o insn.o return_address.o  arm64-obj-$(CONFIG_COMPAT)		+= sys32.o kuser32.o signal32.o 	\  					   sys_compat.o +arm64-obj-$(CONFIG_FUNCTION_TRACER)	+= ftrace.o entry-ftrace.o  arm64-obj-$(CONFIG_MODULES)		+= arm64ksyms.o module.o -arm64-obj-$(CONFIG_SMP)			+= smp.o smp_spin_table.o smp_psci.o +arm64-obj-$(CONFIG_SMP)			+= smp.o smp_spin_table.o topology.o +arm64-obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o  arm64-obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o -arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)+= hw_breakpoint.o -arm64-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o +arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o +arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)	+= sleep.o suspend.o +arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o +arm64-obj-$(CONFIG_KGDB)		+= kgdb.o +arm64-obj-$(CONFIG_EFI)			+= efi.o efi-stub.o efi-entry.o  obj-y					+= $(arm64-obj-y) vdso/  obj-m					+= $(arm64-obj-m) diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 41b4f626d55..a85843ddbde 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -29,16 +29,14 @@  #include <asm/checksum.h> -	/* user mem (segment) */ -EXPORT_SYMBOL(__strnlen_user); -EXPORT_SYMBOL(__strncpy_from_user); -  EXPORT_SYMBOL(copy_page);  EXPORT_SYMBOL(clear_page); +	/* user mem (segment) */  EXPORT_SYMBOL(__copy_from_user);  EXPORT_SYMBOL(__copy_to_user);  EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(__copy_in_user);  	/* physical memory */  EXPORT_SYMBOL(memstart_addr); @@ -46,10 +44,15 @@ EXPORT_SYMBOL(memstart_addr);  	/* string / mem functions */  EXPORT_SYMBOL(strchr);  EXPORT_SYMBOL(strrchr); +EXPORT_SYMBOL(strcmp); +EXPORT_SYMBOL(strncmp); +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strnlen);  EXPORT_SYMBOL(memset);  EXPORT_SYMBOL(memcpy);  EXPORT_SYMBOL(memmove);  EXPORT_SYMBOL(memchr); +EXPORT_SYMBOL(memcmp);  	/* atomic bitops */  EXPORT_SYMBOL(set_bit); @@ -58,3 +61,7 @@ EXPORT_SYMBOL(clear_bit);  EXPORT_SYMBOL(test_and_clear_bit);  EXPORT_SYMBOL(change_bit);  EXPORT_SYMBOL(test_and_change_bit); + +#ifdef CONFIG_FUNCTION_TRACER +EXPORT_SYMBOL(_mcount); +#endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 666e231d410..646f888387c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -25,6 +25,8 @@  #include <asm/thread_info.h>  #include <asm/memory.h>  #include <asm/cputable.h> +#include <asm/smp_plat.h> +#include <asm/suspend.h>  #include <asm/vdso_datapage.h>  #include <linux/kbuild.h> @@ -138,5 +140,14 @@ int main(void)    DEFINE(KVM_VTTBR,		offsetof(struct kvm, arch.vttbr));    DEFINE(KVM_VGIC_VCTRL,	offsetof(struct kvm, arch.vgic.vctrl_base));  #endif +#ifdef CONFIG_ARM64_CPU_SUSPEND +  DEFINE(CPU_SUSPEND_SZ,	sizeof(struct cpu_suspend_ctx)); +  DEFINE(CPU_CTX_SP,		
offsetof(struct cpu_suspend_ctx, sp)); +  DEFINE(MPIDR_HASH_MASK,	offsetof(struct mpidr_hash, mask)); +  DEFINE(MPIDR_HASH_SHIFTS,	offsetof(struct mpidr_hash, shift_aff)); +  DEFINE(SLEEP_SAVE_SP_SZ,	sizeof(struct sleep_save_sp)); +  DEFINE(SLEEP_SAVE_SP_PHYS,	offsetof(struct sleep_save_sp, save_ptr_stash_phys)); +  DEFINE(SLEEP_SAVE_SP_VIRT,	offsetof(struct sleep_save_sp, save_ptr_stash)); +#endif    return 0;  } diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c new file mode 100644 index 00000000000..d62d12fb36c --- /dev/null +++ b/arch/arm64/kernel/cpu_ops.c @@ -0,0 +1,87 @@ +/* + * CPU kernel entry/exit control + * + * Copyright (C) 2013 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ + +#include <asm/cpu_ops.h> +#include <asm/smp_plat.h> +#include <linux/errno.h> +#include <linux/of.h> +#include <linux/string.h> + +extern const struct cpu_operations smp_spin_table_ops; +extern const struct cpu_operations cpu_psci_ops; + +const struct cpu_operations *cpu_ops[NR_CPUS]; + +static const struct cpu_operations *supported_cpu_ops[] __initconst = { +#ifdef CONFIG_SMP +	&smp_spin_table_ops, +	&cpu_psci_ops, +#endif +	NULL, +}; + +static const struct cpu_operations * __init cpu_get_ops(const char *name) +{ +	const struct cpu_operations **ops = supported_cpu_ops; + +	while (*ops) { +		if (!strcmp(name, (*ops)->name)) +			return *ops; + +		ops++; +	} + +	return NULL; +} + +/* + * Read a cpu's enable method from the device tree and record it in cpu_ops. + */ +int __init cpu_read_ops(struct device_node *dn, int cpu) +{ +	const char *enable_method = of_get_property(dn, "enable-method", NULL); +	if (!enable_method) { +		/* +		 * The boot CPU may not have an enable method (e.g. when +		 * spin-table is used for secondaries). Don't warn spuriously. 
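
Aside: a usage sketch for the new cpu_ops table (not part of the hunk above). Once cpu_read_ops() has recorded a CPU's enable method, boot code is expected to drive that CPU purely through cpu_ops[cpu]. The cpu_init/cpu_prepare/cpu_boot callback names below follow the struct cpu_operations introduced elsewhere in this series and are assumptions here rather than something shown in this diff.

	#include <linux/of.h>
	#include <asm/cpu_ops.h>

	/* Hedged sketch: bringing up one secondary CPU via its recorded cpu_ops. */
	static int __init bring_up_cpu_sketch(struct device_node *dn, unsigned int cpu)
	{
		if (cpu_read_ops(dn, cpu))			/* parse "enable-method" */
			return -ENODEV;

		if (!cpu_ops[cpu]->cpu_init || cpu_ops[cpu]->cpu_init(dn, cpu))
			return -ENODEV;			/* e.g. read cpu-release-addr */

		if (cpu_ops[cpu]->cpu_prepare && cpu_ops[cpu]->cpu_prepare(cpu))
			return -ENODEV;

		return cpu_ops[cpu]->cpu_boot(cpu);		/* release/boot the CPU */
	}

The point of the indirection is that the SMP code no longer needs to know whether spin-table or PSCI is in use; it simply calls through the table.
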
+		 */ +		if (cpu != 0) +			pr_err("%s: missing enable-method property\n", +				dn->full_name); +		return -ENOENT; +	} + +	cpu_ops[cpu] = cpu_get_ops(enable_method); +	if (!cpu_ops[cpu]) { +		pr_warn("%s: unsupported enable-method property: %s\n", +			dn->full_name, enable_method); +		return -EOPNOTSUPP; +	} + +	return 0; +} + +void __init cpu_read_bootcpu_ops(void) +{ +	struct device_node *dn = of_get_cpu_node(0, NULL); +	if (!dn) { +		pr_err("Failed to find device node for boot cpu\n"); +		return; +	} +	cpu_read_ops(dn, 0); +} diff --git a/arch/arm64/kernel/cputable.c b/arch/arm64/kernel/cputable.c index 63cfc4a43f4..fd3993cb060 100644 --- a/arch/arm64/kernel/cputable.c +++ b/arch/arm64/kernel/cputable.c @@ -22,7 +22,7 @@  extern unsigned long __cpu_setup(void); -struct cpu_info __initdata cpu_table[] = { +struct cpu_info cpu_table[] = {  	{  		.cpu_id_val	= 0x000f0000,  		.cpu_id_mask	= 0x000f0000, diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index cbfacf7fb43..a7fb874b595 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -27,7 +27,6 @@  #include <linux/uaccess.h>  #include <asm/debug-monitors.h> -#include <asm/local.h>  #include <asm/cputype.h>  #include <asm/system_misc.h> @@ -89,8 +88,8 @@ early_param("nodebugmon", early_debug_disable);   * Keep track of debug users on each core.   * The ref counts are per-cpu so we use a local_t type.   */ -static DEFINE_PER_CPU(local_t, mde_ref_count); -static DEFINE_PER_CPU(local_t, kde_ref_count); +static DEFINE_PER_CPU(int, mde_ref_count); +static DEFINE_PER_CPU(int, kde_ref_count);  void enable_debug_monitors(enum debug_el el)  { @@ -98,11 +97,11 @@ void enable_debug_monitors(enum debug_el el)  	WARN_ON(preemptible()); -	if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1) +	if (this_cpu_inc_return(mde_ref_count) == 1)  		enable = DBG_MDSCR_MDE;  	if (el == DBG_ACTIVE_EL1 && -	    local_inc_return(&__get_cpu_var(kde_ref_count)) == 1) +	    this_cpu_inc_return(kde_ref_count) == 1)  		enable |= DBG_MDSCR_KDE;  	if (enable && debug_enabled) { @@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_el el)  	WARN_ON(preemptible()); -	if (local_dec_and_test(&__get_cpu_var(mde_ref_count))) +	if (this_cpu_dec_return(mde_ref_count) == 0)  		disable = ~DBG_MDSCR_MDE;  	if (el == DBG_ACTIVE_EL1 && -	    local_dec_and_test(&__get_cpu_var(kde_ref_count))) +	    this_cpu_dec_return(kde_ref_count) == 0)  		disable &= ~DBG_MDSCR_KDE;  	if (disable) { @@ -138,7 +137,6 @@ void disable_debug_monitors(enum debug_el el)  static void clear_os_lock(void *unused)  {  	asm volatile("msr oslar_el1, %0" : : "r" (0)); -	isb();  }  static int os_lock_notify(struct notifier_block *self, @@ -156,12 +154,17 @@ static struct notifier_block os_lock_nb = {  static int debug_monitors_init(void)  { +	cpu_notifier_register_begin(); +  	/* Clear the OS lock. */ -	smp_call_function(clear_os_lock, NULL, 1); -	clear_os_lock(NULL); +	on_each_cpu(clear_os_lock, NULL, 1); +	isb(); +	local_dbg_enable();  	/* Register hotplug handler. 
*/ -	register_cpu_notifier(&os_lock_nb); +	__register_cpu_notifier(&os_lock_nb); + +	cpu_notifier_register_done();  	return 0;  }  postcore_initcall(debug_monitors_init); @@ -188,6 +191,48 @@ static void clear_regs_spsr_ss(struct pt_regs *regs)  	regs->pstate = spsr;  } +/* EL1 Single Step Handler hooks */ +static LIST_HEAD(step_hook); +static DEFINE_RWLOCK(step_hook_lock); + +void register_step_hook(struct step_hook *hook) +{ +	write_lock(&step_hook_lock); +	list_add(&hook->node, &step_hook); +	write_unlock(&step_hook_lock); +} + +void unregister_step_hook(struct step_hook *hook) +{ +	write_lock(&step_hook_lock); +	list_del(&hook->node); +	write_unlock(&step_hook_lock); +} + +/* + * Call registered single step handers + * There is no Syndrome info to check for determining the handler. + * So we call all the registered handlers, until the right handler is + * found which returns zero. + */ +static int call_step_hook(struct pt_regs *regs, unsigned int esr) +{ +	struct step_hook *hook; +	int retval = DBG_HOOK_ERROR; + +	read_lock(&step_hook_lock); + +	list_for_each_entry(hook, &step_hook, node)	{ +		retval = hook->fn(regs, esr); +		if (retval == DBG_HOOK_HANDLED) +			break; +	} + +	read_unlock(&step_hook_lock); + +	return retval; +} +  static int single_step_handler(unsigned long addr, unsigned int esr,  			       struct pt_regs *regs)  { @@ -215,7 +260,9 @@ static int single_step_handler(unsigned long addr, unsigned int esr,  		 */  		user_rewind_single_step(current);  	} else { -		/* TODO: route to KGDB */ +		if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED) +			return 0; +  		pr_warning("Unexpected kernel single-step exception at EL1\n");  		/*  		 * Re-enable stepping since we know that we will be @@ -227,11 +274,50 @@ static int single_step_handler(unsigned long addr, unsigned int esr,  	return 0;  } +/* + * Breakpoint handler is re-entrant as another breakpoint can + * hit within breakpoint handler, especically in kprobes. + * Use reader/writer locks instead of plain spinlock. + */ +static LIST_HEAD(break_hook); +static DEFINE_RWLOCK(break_hook_lock); + +void register_break_hook(struct break_hook *hook) +{ +	write_lock(&break_hook_lock); +	list_add(&hook->node, &break_hook); +	write_unlock(&break_hook_lock); +} + +void unregister_break_hook(struct break_hook *hook) +{ +	write_lock(&break_hook_lock); +	list_del(&hook->node); +	write_unlock(&break_hook_lock); +} + +static int call_break_hook(struct pt_regs *regs, unsigned int esr) +{ +	struct break_hook *hook; +	int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; + +	read_lock(&break_hook_lock); +	list_for_each_entry(hook, &break_hook, node) +		if ((esr & hook->esr_mask) == hook->esr_val) +			fn = hook->fn; +	read_unlock(&break_hook_lock); + +	return fn ? 
fn(regs, esr) : DBG_HOOK_ERROR; +} +  static int brk_handler(unsigned long addr, unsigned int esr,  		       struct pt_regs *regs)  {  	siginfo_t info; +	if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) +		return 0; +  	if (!user_mode(regs))  		return -EFAULT; @@ -249,7 +335,8 @@ static int brk_handler(unsigned long addr, unsigned int esr,  int aarch32_break_handler(struct pt_regs *regs)  {  	siginfo_t info; -	unsigned int instr; +	u32 arm_instr; +	u16 thumb_instr;  	bool bp = false;  	void __user *pc = (void __user *)instruction_pointer(regs); @@ -258,18 +345,21 @@ int aarch32_break_handler(struct pt_regs *regs)  	if (compat_thumb_mode(regs)) {  		/* get 16-bit Thumb instruction */ -		get_user(instr, (u16 __user *)pc); -		if (instr == AARCH32_BREAK_THUMB2_LO) { +		get_user(thumb_instr, (u16 __user *)pc); +		thumb_instr = le16_to_cpu(thumb_instr); +		if (thumb_instr == AARCH32_BREAK_THUMB2_LO) {  			/* get second half of 32-bit Thumb-2 instruction */ -			get_user(instr, (u16 __user *)(pc + 2)); -			bp = instr == AARCH32_BREAK_THUMB2_HI; +			get_user(thumb_instr, (u16 __user *)(pc + 2)); +			thumb_instr = le16_to_cpu(thumb_instr); +			bp = thumb_instr == AARCH32_BREAK_THUMB2_HI;  		} else { -			bp = instr == AARCH32_BREAK_THUMB; +			bp = thumb_instr == AARCH32_BREAK_THUMB;  		}  	} else {  		/* 32-bit ARM instruction */ -		get_user(instr, (u32 __user *)pc); -		bp = (instr & ~0xf0000000) == AARCH32_BREAK_ARM; +		get_user(arm_instr, (u32 __user *)pc); +		arm_instr = le32_to_cpu(arm_instr); +		bp = (arm_instr & ~0xf0000000) == AARCH32_BREAK_ARM;  	}  	if (!bp) diff --git a/arch/arm64/kernel/early_printk.c b/arch/arm64/kernel/early_printk.c deleted file mode 100644 index fbb6e184365..00000000000 --- a/arch/arm64/kernel/early_printk.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Earlyprintk support. - * - * Copyright (C) 2012 ARM Ltd. - * Author: Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <http://www.gnu.org/licenses/>. - */ -#include <linux/kernel.h> -#include <linux/console.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/io.h> - -#include <linux/amba/serial.h> -#include <linux/serial_reg.h> - -static void __iomem *early_base; -static void (*printch)(char ch); - -/* - * PL011 single character TX. - */ -static void pl011_printch(char ch) -{ -	while (readl_relaxed(early_base + UART01x_FR) & UART01x_FR_TXFF) -		; -	writeb_relaxed(ch, early_base + UART01x_DR); -	while (readl_relaxed(early_base + UART01x_FR) & UART01x_FR_BUSY) -		; -} - -/* - * Semihosting-based debug console - */ -static void smh_printch(char ch) -{ -	asm volatile("mov  x1, %0\n" -		     "mov  x0, #3\n" -		     "hlt  0xf000\n" -		     : : "r" (&ch) : "x0", "x1", "memory"); -} - -/* - * 8250/16550 (8-bit aligned registers) single character TX. 
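
Aside: a registration sketch for the break_hook interface added to debug-monitors.c above. The ESR value, mask and handler below are hypothetical, purely to show the shape of a client such as kgdb or kprobes; they are not taken from this diff.

	#include <linux/kernel.h>
	#include <asm/ptrace.h>
	#include <asm/debug-monitors.h>

	static int example_brk_handler(struct pt_regs *regs, unsigned int esr)
	{
		pr_info("example BRK hit at %lx\n", instruction_pointer(regs));
		return DBG_HOOK_HANDLED;	/* tell brk_handler() the BRK was consumed */
	}

	static struct break_hook example_brk_hook = {
		.esr_val  = 0xf2000123,		/* hypothetical: one specific BRK immediate */
		.esr_mask = 0xffffffff,
		.fn	  = example_brk_handler,
	};

	/* register_break_hook(&example_brk_hook) from an initcall,
	 * unregister_break_hook() on teardown. */

If no registered hook claims the ESR, call_break_hook() returns DBG_HOOK_ERROR and brk_handler() falls back to delivering SIGTRAP to user space (or failing in kernel mode), as before.
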
- */ -static void uart8250_8bit_printch(char ch) -{ -	while (!(readb_relaxed(early_base + UART_LSR) & UART_LSR_THRE)) -		; -	writeb_relaxed(ch, early_base + UART_TX); -} - -/* - * 8250/16550 (32-bit aligned registers) single character TX. - */ -static void uart8250_32bit_printch(char ch) -{ -	while (!(readl_relaxed(early_base + (UART_LSR << 2)) & UART_LSR_THRE)) -		; -	writel_relaxed(ch, early_base + (UART_TX << 2)); -} - -struct earlycon_match { -	const char *name; -	void (*printch)(char ch); -}; - -static const struct earlycon_match earlycon_match[] __initconst = { -	{ .name = "pl011", .printch = pl011_printch, }, -	{ .name = "smh", .printch = smh_printch, }, -	{ .name = "uart8250-8bit", .printch = uart8250_8bit_printch, }, -	{ .name = "uart8250-32bit", .printch = uart8250_32bit_printch, }, -	{} -}; - -static void early_write(struct console *con, const char *s, unsigned n) -{ -	while (n-- > 0) { -		if (*s == '\n') -			printch('\r'); -		printch(*s); -		s++; -	} -} - -static struct console early_console_dev = { -	.name =		"earlycon", -	.write =	early_write, -	.flags =	CON_PRINTBUFFER | CON_BOOT, -	.index =	-1, -}; - -/* - * Parse earlyprintk=... parameter in the format: - * - *   <name>[,<addr>][,<options>] - * - * and register the early console. It is assumed that the UART has been - * initialised by the bootloader already. - */ -static int __init setup_early_printk(char *buf) -{ -	const struct earlycon_match *match = earlycon_match; -	phys_addr_t paddr = 0; - -	if (!buf) { -		pr_warning("No earlyprintk arguments passed.\n"); -		return 0; -	} - -	while (match->name) { -		size_t len = strlen(match->name); -		if (!strncmp(buf, match->name, len)) { -			buf += len; -			break; -		} -		match++; -	} -	if (!match->name) { -		pr_warning("Unknown earlyprintk arguments: %s\n", buf); -		return 0; -	} - -	/* I/O address */ -	if (!strncmp(buf, ",0x", 3)) { -		char *e; -		paddr = simple_strtoul(buf + 1, &e, 16); -		buf = e; -	} -	/* no options parsing yet */ - -	if (paddr) -		early_base = early_io_map(paddr, EARLYCON_IOBASE); - -	printch = match->printch; -	early_console = &early_console_dev; -	register_console(&early_console_dev); - -	return 0; -} - -early_param("earlyprintk", setup_early_printk); diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S new file mode 100644 index 00000000000..619b1dd7bcd --- /dev/null +++ b/arch/arm64/kernel/efi-entry.S @@ -0,0 +1,108 @@ +/* + * EFI entry point. + * + * Copyright (C) 2013, 2014 Red Hat, Inc. + * Author: Mark Salter <msalter@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/linkage.h> +#include <linux/init.h> + +#include <asm/assembler.h> + +#define EFI_LOAD_ERROR 0x8000000000000001 + +	__INIT + +	/* +	 * We arrive here from the EFI boot manager with: +	 * +	 *    * CPU in little-endian mode +	 *    * MMU on with identity-mapped RAM +	 *    * Icache and Dcache on +	 * +	 * We will most likely be running from some place other than where +	 * we want to be. The kernel image wants to be placed at TEXT_OFFSET +	 * from start of RAM. +	 */ +ENTRY(efi_stub_entry) +	/* +	 * Create a stack frame to save FP/LR with extra space +	 * for image_addr variable passed to efi_entry(). +	 */ +	stp	x29, x30, [sp, #-32]! + +	/* +	 * Call efi_entry to do the real work. +	 * x0 and x1 are already set up by firmware. 
Current runtime +	 * address of image is calculated and passed via *image_addr. +	 * +	 * unsigned long efi_entry(void *handle, +	 *                         efi_system_table_t *sys_table, +	 *                         unsigned long *image_addr) ; +	 */ +	adrp	x8, _text +	add	x8, x8, #:lo12:_text +	add	x2, sp, 16 +	str	x8, [x2] +	bl	efi_entry +	cmn	x0, #1 +	b.eq	efi_load_fail + +	/* +	 * efi_entry() will have relocated the kernel image if necessary +	 * and we return here with device tree address in x0 and the kernel +	 * entry point stored at *image_addr. Save those values in registers +	 * which are callee preserved. +	 */ +	mov	x20, x0		// DTB address +	ldr	x0, [sp, #16]	// relocated _text address +	mov	x21, x0 + +	/* +	 * Flush dcache covering current runtime addresses +	 * of kernel text/data. Then flush all of icache. +	 */ +	adrp	x1, _text +	add	x1, x1, #:lo12:_text +	adrp	x2, _edata +	add	x2, x2, #:lo12:_edata +	sub	x1, x2, x1 + +	bl	__flush_dcache_area +	ic	ialluis + +	/* Turn off Dcache and MMU */ +	mrs	x0, CurrentEL +	cmp	x0, #CurrentEL_EL2 +	b.ne	1f +	mrs	x0, sctlr_el2 +	bic	x0, x0, #1 << 0	// clear SCTLR.M +	bic	x0, x0, #1 << 2	// clear SCTLR.C +	msr	sctlr_el2, x0 +	isb +	b	2f +1: +	mrs	x0, sctlr_el1 +	bic	x0, x0, #1 << 0	// clear SCTLR.M +	bic	x0, x0, #1 << 2	// clear SCTLR.C +	msr	sctlr_el1, x0 +	isb +2: +	/* Jump to kernel entry point */ +	mov	x0, x20 +	mov	x1, xzr +	mov	x2, xzr +	mov	x3, xzr +	br	x21 + +efi_load_fail: +	mov	x0, #EFI_LOAD_ERROR +	ldp	x29, x30, [sp], #32 +	ret + +ENDPROC(efi_stub_entry) diff --git a/arch/arm64/kernel/efi-stub.c b/arch/arm64/kernel/efi-stub.c new file mode 100644 index 00000000000..e786e6cdc40 --- /dev/null +++ b/arch/arm64/kernel/efi-stub.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013, 2014 Linaro Ltd;  <roy.franz@linaro.org> + * + * This file implements the EFI boot stub for the arm64 kernel. + * Adapted from ARM version by Mark Salter <msalter@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/efi.h> +#include <linux/libfdt.h> +#include <asm/sections.h> + +/* + * AArch64 requires the DTB to be 8-byte aligned in the first 512MiB from + * start of kernel and may not cross a 2MiB boundary. We set alignment to + * 2MiB so we know it won't cross a 2MiB boundary. + */ +#define EFI_FDT_ALIGN	SZ_2M   /* used by allocate_new_fdt_and_exit_boot() */ +#define MAX_FDT_OFFSET	SZ_512M + +#define efi_call_early(f, ...) 
sys_table_arg->boottime->f(__VA_ARGS__) + +static void efi_char16_printk(efi_system_table_t *sys_table_arg, +			      efi_char16_t *str); + +static efi_status_t efi_open_volume(efi_system_table_t *sys_table, +				    void *__image, void **__fh); +static efi_status_t efi_file_close(void *handle); + +static efi_status_t +efi_file_read(void *handle, unsigned long *size, void *addr); + +static efi_status_t +efi_file_size(efi_system_table_t *sys_table, void *__fh, +	      efi_char16_t *filename_16, void **handle, u64 *file_sz); + +/* Include shared EFI stub code */ +#include "../../../drivers/firmware/efi/efi-stub-helper.c" +#include "../../../drivers/firmware/efi/fdt.c" +#include "../../../drivers/firmware/efi/arm-stub.c" + + +static efi_status_t handle_kernel_image(efi_system_table_t *sys_table, +					unsigned long *image_addr, +					unsigned long *image_size, +					unsigned long *reserve_addr, +					unsigned long *reserve_size, +					unsigned long dram_base, +					efi_loaded_image_t *image) +{ +	efi_status_t status; +	unsigned long kernel_size, kernel_memsize = 0; + +	/* Relocate the image, if required. */ +	kernel_size = _edata - _text; +	if (*image_addr != (dram_base + TEXT_OFFSET)) { +		kernel_memsize = kernel_size + (_end - _edata); +		status = efi_relocate_kernel(sys_table, image_addr, +					     kernel_size, kernel_memsize, +					     dram_base + TEXT_OFFSET, +					     PAGE_SIZE); +		if (status != EFI_SUCCESS) { +			pr_efi_err(sys_table, "Failed to relocate kernel\n"); +			return status; +		} +		if (*image_addr != (dram_base + TEXT_OFFSET)) { +			pr_efi_err(sys_table, "Failed to alloc kernel memory\n"); +			efi_free(sys_table, kernel_memsize, *image_addr); +			return EFI_ERROR; +		} +		*image_size = kernel_memsize; +	} + + +	return EFI_SUCCESS; +} diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c new file mode 100644 index 00000000000..14db1f6e8d7 --- /dev/null +++ b/arch/arm64/kernel/efi.c @@ -0,0 +1,469 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013, 2014 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include <linux/efi.h> +#include <linux/export.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/cacheflush.h> +#include <asm/efi.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> + +struct efi_memory_map memmap; + +static efi_runtime_services_t *runtime; + +static u64 efi_system_table; + +static int uefi_debug __initdata; +static int __init uefi_debug_setup(char *str) +{ +	uefi_debug = 1; + +	return 0; +} +early_param("uefi_debug", uefi_debug_setup); + +static int __init is_normal_ram(efi_memory_desc_t *md) +{ +	if (md->attribute & EFI_MEMORY_WB) +		return 1; +	return 0; +} + +static void __init efi_setup_idmap(void) +{ +	struct memblock_region *r; +	efi_memory_desc_t *md; +	u64 paddr, npages, size; + +	for_each_memblock(memory, r) +		create_id_mapping(r->base, r->size, 0); + +	/* map runtime io spaces */ +	for_each_efi_memory_desc(&memmap, md) { +		if (!(md->attribute & EFI_MEMORY_RUNTIME) || is_normal_ram(md)) +			continue; +		paddr = md->phys_addr; +		npages = md->num_pages; +		memrange_efi_to_native(&paddr, &npages); +		size = npages << PAGE_SHIFT; +		create_id_mapping(paddr, size, 1); +	} +} + +static int __init uefi_init(void) +{ +	efi_char16_t *c16; +	char vendor[100] = "unknown"; +	int i, retval; + +	efi.systab = early_memremap(efi_system_table, +				    sizeof(efi_system_table_t)); +	if (efi.systab == NULL) { +		pr_warn("Unable to map EFI system table.\n"); +		return -ENOMEM; +	} + +	set_bit(EFI_BOOT, &efi.flags); +	set_bit(EFI_64BIT, &efi.flags); + +	/* +	 * Verify the EFI Table +	 */ +	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { +		pr_err("System table signature incorrect\n"); +		return -EINVAL; +	} +	if ((efi.systab->hdr.revision >> 16) < 2) +		pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", +			efi.systab->hdr.revision >> 16, +			efi.systab->hdr.revision & 0xffff); + +	/* Show what we know for posterity */ +	c16 = early_memremap(efi.systab->fw_vendor, +			     sizeof(vendor)); +	if (c16) { +		for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) +			vendor[i] = c16[i]; +		vendor[i] = '\0'; +	} + +	pr_info("EFI v%u.%.02u by %s\n", +		efi.systab->hdr.revision >> 16, +		efi.systab->hdr.revision & 0xffff, vendor); + +	retval = efi_config_init(NULL); +	if (retval == 0) +		set_bit(EFI_CONFIG_TABLES, &efi.flags); + +	early_memunmap(c16, sizeof(vendor)); +	early_memunmap(efi.systab,  sizeof(efi_system_table_t)); + +	return retval; +} + +static __initdata char memory_type_name[][32] = { +	{"Reserved"}, +	{"Loader Code"}, +	{"Loader Data"}, +	{"Boot Code"}, +	{"Boot Data"}, +	{"Runtime Code"}, +	{"Runtime Data"}, +	{"Conventional Memory"}, +	{"Unusable Memory"}, +	{"ACPI Reclaim Memory"}, +	{"ACPI Memory NVS"}, +	{"Memory Mapped I/O"}, +	{"MMIO Port Space"}, +	{"PAL Code"}, +}; + +/* + * Return true for RAM regions we want to permanently reserve. 
+ */ +static __init int is_reserve_region(efi_memory_desc_t *md) +{ +	if (!is_normal_ram(md)) +		return 0; + +	if (md->attribute & EFI_MEMORY_RUNTIME) +		return 1; + +	if (md->type == EFI_ACPI_RECLAIM_MEMORY || +	    md->type == EFI_RESERVED_TYPE) +		return 1; + +	return 0; +} + +static __init void reserve_regions(void) +{ +	efi_memory_desc_t *md; +	u64 paddr, npages, size; + +	if (uefi_debug) +		pr_info("Processing EFI memory map:\n"); + +	for_each_efi_memory_desc(&memmap, md) { +		paddr = md->phys_addr; +		npages = md->num_pages; + +		if (uefi_debug) +			pr_info("  0x%012llx-0x%012llx [%s]", +				paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, +				memory_type_name[md->type]); + +		memrange_efi_to_native(&paddr, &npages); +		size = npages << PAGE_SHIFT; + +		if (is_normal_ram(md)) +			early_init_dt_add_memory_arch(paddr, size); + +		if (is_reserve_region(md) || +		    md->type == EFI_BOOT_SERVICES_CODE || +		    md->type == EFI_BOOT_SERVICES_DATA) { +			memblock_reserve(paddr, size); +			if (uefi_debug) +				pr_cont("*"); +		} + +		if (uefi_debug) +			pr_cont("\n"); +	} +} + + +static u64 __init free_one_region(u64 start, u64 end) +{ +	u64 size = end - start; + +	if (uefi_debug) +		pr_info("  EFI freeing: 0x%012llx-0x%012llx\n",	start, end - 1); + +	free_bootmem_late(start, size); +	return size; +} + +static u64 __init free_region(u64 start, u64 end) +{ +	u64 map_start, map_end, total = 0; + +	if (end <= start) +		return total; + +	map_start = (u64)memmap.phys_map; +	map_end = PAGE_ALIGN(map_start + (memmap.map_end - memmap.map)); +	map_start &= PAGE_MASK; + +	if (start < map_end && end > map_start) { +		/* region overlaps UEFI memmap */ +		if (start < map_start) +			total += free_one_region(start, map_start); + +		if (map_end < end) +			total += free_one_region(map_end, end); +	} else +		total += free_one_region(start, end); + +	return total; +} + +static void __init free_boot_services(void) +{ +	u64 total_freed = 0; +	u64 keep_end, free_start, free_end; +	efi_memory_desc_t *md; + +	/* +	 * If kernel uses larger pages than UEFI, we have to be careful +	 * not to inadvertantly free memory we want to keep if there is +	 * overlap at the kernel page size alignment. We do not want to +	 * free is_reserve_region() memory nor the UEFI memmap itself. +	 * +	 * The memory map is sorted, so we keep track of the end of +	 * any previous region we want to keep, remember any region +	 * we want to free and defer freeing it until we encounter +	 * the next region we want to keep. This way, before freeing +	 * it, we can clip it as needed to avoid freeing memory we +	 * want to keep for UEFI. +	 */ + +	keep_end = 0; +	free_start = 0; + +	for_each_efi_memory_desc(&memmap, md) { +		u64 paddr, npages, size; + +		if (is_reserve_region(md)) { +			/* +			 * We don't want to free any memory from this region. +			 */ +			if (free_start) { +				/* adjust free_end then free region */ +				if (free_end > md->phys_addr) +					free_end -= PAGE_SIZE; +				total_freed += free_region(free_start, free_end); +				free_start = 0; +			} +			keep_end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); +			continue; +		} + +		if (md->type != EFI_BOOT_SERVICES_CODE && +		    md->type != EFI_BOOT_SERVICES_DATA) { +			/* no need to free this region */ +			continue; +		} + +		/* +		 * We want to free memory from this region. 
+		 */ +		paddr = md->phys_addr; +		npages = md->num_pages; +		memrange_efi_to_native(&paddr, &npages); +		size = npages << PAGE_SHIFT; + +		if (free_start) { +			if (paddr <= free_end) +				free_end = paddr + size; +			else { +				total_freed += free_region(free_start, free_end); +				free_start = paddr; +				free_end = paddr + size; +			} +		} else { +			free_start = paddr; +			free_end = paddr + size; +		} +		if (free_start < keep_end) { +			free_start += PAGE_SIZE; +			if (free_start >= free_end) +				free_start = 0; +		} +	} +	if (free_start) +		total_freed += free_region(free_start, free_end); + +	if (total_freed) +		pr_info("Freed 0x%llx bytes of EFI boot services memory", +			total_freed); +} + +void __init efi_init(void) +{ +	struct efi_fdt_params params; + +	/* Grab UEFI information placed in FDT by stub */ +	if (!efi_get_fdt_params(¶ms, uefi_debug)) +		return; + +	efi_system_table = params.system_table; + +	memblock_reserve(params.mmap & PAGE_MASK, +			 PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); +	memmap.phys_map = (void *)params.mmap; +	memmap.map = early_memremap(params.mmap, params.mmap_size); +	memmap.map_end = memmap.map + params.mmap_size; +	memmap.desc_size = params.desc_size; +	memmap.desc_version = params.desc_ver; + +	if (uefi_init() < 0) +		return; + +	reserve_regions(); +} + +void __init efi_idmap_init(void) +{ +	if (!efi_enabled(EFI_BOOT)) +		return; + +	/* boot time idmap_pg_dir is incomplete, so fill in missing parts */ +	efi_setup_idmap(); +} + +static int __init remap_region(efi_memory_desc_t *md, void **new) +{ +	u64 paddr, vaddr, npages, size; + +	paddr = md->phys_addr; +	npages = md->num_pages; +	memrange_efi_to_native(&paddr, &npages); +	size = npages << PAGE_SHIFT; + +	if (is_normal_ram(md)) +		vaddr = (__force u64)ioremap_cache(paddr, size); +	else +		vaddr = (__force u64)ioremap(paddr, size); + +	if (!vaddr) { +		pr_err("Unable to remap 0x%llx pages @ %p\n", +		       npages, (void *)paddr); +		return 0; +	} + +	/* adjust for any rounding when EFI and system pagesize differs */ +	md->virt_addr = vaddr + (md->phys_addr - paddr); + +	if (uefi_debug) +		pr_info("  EFI remap 0x%012llx => %p\n", +			md->phys_addr, (void *)md->virt_addr); + +	memcpy(*new, md, memmap.desc_size); +	*new += memmap.desc_size; + +	return 1; +} + +/* + * Switch UEFI from an identity map to a kernel virtual map + */ +static int __init arm64_enter_virtual_mode(void) +{ +	efi_memory_desc_t *md; +	phys_addr_t virtmap_phys; +	void *virtmap, *virt_md; +	efi_status_t status; +	u64 mapsize; +	int count = 0; +	unsigned long flags; + +	if (!efi_enabled(EFI_BOOT)) { +		pr_info("EFI services will not be available.\n"); +		return -1; +	} + +	pr_info("Remapping and enabling EFI services.\n"); + +	/* replace early memmap mapping with permanent mapping */ +	mapsize = memmap.map_end - memmap.map; +	early_memunmap(memmap.map, mapsize); +	memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map, +						   mapsize); +	memmap.map_end = memmap.map + mapsize; + +	efi.memmap = &memmap; + +	/* Map the runtime regions */ +	virtmap = kmalloc(mapsize, GFP_KERNEL); +	if (!virtmap) { +		pr_err("Failed to allocate EFI virtual memmap\n"); +		return -1; +	} +	virtmap_phys = virt_to_phys(virtmap); +	virt_md = virtmap; + +	for_each_efi_memory_desc(&memmap, md) { +		if (!(md->attribute & EFI_MEMORY_RUNTIME)) +			continue; +		if (remap_region(md, &virt_md)) +			++count; +	} + +	efi.systab = (__force void *)efi_lookup_mapped_addr(efi_system_table); +	if (efi.systab) +		
set_bit(EFI_SYSTEM_TABLES, &efi.flags); + +	local_irq_save(flags); +	cpu_switch_mm(idmap_pg_dir, &init_mm); + +	/* Call SetVirtualAddressMap with the physical address of the map */ +	runtime = efi.systab->runtime; +	efi.set_virtual_address_map = runtime->set_virtual_address_map; + +	status = efi.set_virtual_address_map(count * memmap.desc_size, +					     memmap.desc_size, +					     memmap.desc_version, +					     (efi_memory_desc_t *)virtmap_phys); +	cpu_set_reserved_ttbr0(); +	flush_tlb_all(); +	local_irq_restore(flags); + +	kfree(virtmap); + +	free_boot_services(); + +	if (status != EFI_SUCCESS) { +		pr_err("Failed to set EFI virtual address map! [%lx]\n", +			status); +		return -1; +	} + +	/* Set up runtime services function pointers */ +	runtime = efi.systab->runtime; +	efi.get_time = runtime->get_time; +	efi.set_time = runtime->set_time; +	efi.get_wakeup_time = runtime->get_wakeup_time; +	efi.set_wakeup_time = runtime->set_wakeup_time; +	efi.get_variable = runtime->get_variable; +	efi.get_next_variable = runtime->get_next_variable; +	efi.set_variable = runtime->set_variable; +	efi.query_variable_info = runtime->query_variable_info; +	efi.update_capsule = runtime->update_capsule; +	efi.query_capsule_caps = runtime->query_capsule_caps; +	efi.get_next_high_mono_count = runtime->get_next_high_mono_count; +	efi.reset_system = runtime->reset_system; + +	set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + +	return 0; +} +early_initcall(arm64_enter_virtual_mode); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa..d358ccacfc0 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)  	fpsimd_restore x0, 8  	ret  ENDPROC(fpsimd_load_state) + +#ifdef CONFIG_KERNEL_MODE_NEON + +/* + * Save the bottom n FP registers. + * + * x0 - pointer to struct fpsimd_partial_state + */ +ENTRY(fpsimd_save_partial_state) +	fpsimd_save_partial x0, 1, 8, 9 +	ret +ENDPROC(fpsimd_load_partial_state) + +/* + * Load the bottom n FP registers. + * + * x0 - pointer to struct fpsimd_partial_state + */ +ENTRY(fpsimd_load_partial_state) +	fpsimd_restore_partial x0, 8, 9 +	ret +ENDPROC(fpsimd_load_partial_state) + +#endif diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S new file mode 100644 index 00000000000..aa5f9fcbf9e --- /dev/null +++ b/arch/arm64/kernel/entry-ftrace.S @@ -0,0 +1,218 @@ +/* + * arch/arm64/kernel/entry-ftrace.S + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/ftrace.h> +#include <asm/insn.h> + +/* + * Gcc with -pg will put the following code in the beginning of each function: + *      mov x0, x30 + *      bl _mcount + *	[function's body ...] + * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic + * ftrace is enabled. + * + * Please note that x0 as an argument will not be used here because we can + * get lr(x30) of instrumented function at any time by winding up call stack + * as long as the kernel is compiled without -fomit-frame-pointer. 
+ * (or CONFIG_FRAME_POINTER, this is forced on arm64) + * + * stack layout after mcount_enter in _mcount(): + * + * current sp/fp =>  0:+-----+ + * in _mcount()        | x29 | -> instrumented function's fp + *                     +-----+ + *                     | x30 | -> _mcount()'s lr (= instrumented function's pc) + * old sp       => +16:+-----+ + * when instrumented   |     | + * function calls      | ... | + * _mcount()           |     | + *                     |     | + * instrumented => +xx:+-----+ + * function's fp       | x29 | -> parent's fp + *                     +-----+ + *                     | x30 | -> instrumented function's lr (= parent's pc) + *                     +-----+ + *                     | ... | + */ + +	.macro mcount_enter +	stp	x29, x30, [sp, #-16]! +	mov	x29, sp +	.endm + +	.macro mcount_exit +	ldp	x29, x30, [sp], #16 +	ret +	.endm + +	.macro mcount_adjust_addr rd, rn +	sub	\rd, \rn, #AARCH64_INSN_SIZE +	.endm + +	/* for instrumented function's parent */ +	.macro mcount_get_parent_fp reg +	ldr	\reg, [x29] +	ldr	\reg, [\reg] +	.endm + +	/* for instrumented function */ +	.macro mcount_get_pc0 reg +	mcount_adjust_addr	\reg, x30 +	.endm + +	.macro mcount_get_pc reg +	ldr	\reg, [x29, #8] +	mcount_adjust_addr	\reg, \reg +	.endm + +	.macro mcount_get_lr reg +	ldr	\reg, [x29] +	ldr	\reg, [\reg, #8] +	mcount_adjust_addr	\reg, \reg +	.endm + +	.macro mcount_get_lr_addr reg +	ldr	\reg, [x29] +	add	\reg, \reg, #8 +	.endm + +#ifndef CONFIG_DYNAMIC_FTRACE +/* + * void _mcount(unsigned long return_address) + * @return_address: return address to instrumented function + * + * This function makes calls, if enabled, to: + *     - tracer function to probe instrumented function's entry, + *     - ftrace_graph_caller to set up an exit hook + */ +ENTRY(_mcount) +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +	ldr	x0, =ftrace_trace_stop +	ldr	x0, [x0]		// if ftrace_trace_stop +	ret				//   return; +#endif +	mcount_enter + +	ldr	x0, =ftrace_trace_function +	ldr	x2, [x0] +	adr	x0, ftrace_stub +	cmp	x0, x2			// if (ftrace_trace_function +	b.eq	skip_ftrace_call	//     != ftrace_stub) { + +	mcount_get_pc	x0		//       function's pc +	mcount_get_lr	x1		//       function's lr (= parent's pc) +	blr	x2			//   (*ftrace_trace_function)(pc, lr); + +#ifndef CONFIG_FUNCTION_GRAPH_TRACER +skip_ftrace_call:			//   return; +	mcount_exit			// } +#else +	mcount_exit			//   return; +					// } +skip_ftrace_call: +	ldr	x1, =ftrace_graph_return +	ldr	x2, [x1]		//   if ((ftrace_graph_return +	cmp	x0, x2			//        != ftrace_stub) +	b.ne	ftrace_graph_caller + +	ldr	x1, =ftrace_graph_entry	//     || (ftrace_graph_entry +	ldr	x2, [x1]		//        != ftrace_graph_entry_stub)) +	ldr	x0, =ftrace_graph_entry_stub +	cmp	x0, x2 +	b.ne	ftrace_graph_caller	//     ftrace_graph_caller(); + +	mcount_exit +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +ENDPROC(_mcount) + +#else /* CONFIG_DYNAMIC_FTRACE */ +/* + * _mcount() is used to build the kernel with -pg option, but all the branch + * instructions to _mcount() are replaced to NOP initially at kernel start up, + * and later on, NOP to branch to ftrace_caller() when enabled or branch to + * NOP when disabled per-function base. 
+ */ +ENTRY(_mcount) +	ret +ENDPROC(_mcount) + +/* + * void ftrace_caller(unsigned long return_address) + * @return_address: return address to instrumented function + * + * This function is a counterpart of _mcount() in 'static' ftrace, and + * makes calls to: + *     - tracer function to probe instrumented function's entry, + *     - ftrace_graph_caller to set up an exit hook + */ +ENTRY(ftrace_caller) +	mcount_enter + +	mcount_get_pc0	x0		//     function's pc +	mcount_get_lr	x1		//     function's lr + +	.global ftrace_call +ftrace_call:				// tracer(pc, lr); +	nop				// This will be replaced with "bl xxx" +					// where xxx can be any kind of tracer. + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	.global ftrace_graph_call +ftrace_graph_call:			// ftrace_graph_caller(); +	nop				// If enabled, this will be replaced +					// "b ftrace_graph_caller" +#endif + +	mcount_exit +ENDPROC(ftrace_caller) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +ENTRY(ftrace_stub) +	ret +ENDPROC(ftrace_stub) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * void ftrace_graph_caller(void) + * + * Called from _mcount() or ftrace_caller() when function_graph tracer is + * selected. + * This function w/ prepare_ftrace_return() fakes link register's value on + * the call stack in order to intercept instrumented function's return path + * and run return_to_handler() later on its exit. + */ +ENTRY(ftrace_graph_caller) +	mcount_get_lr_addr	  x0	//     pointer to function's saved lr +	mcount_get_pc		  x1	//     function's pc +	mcount_get_parent_fp	  x2	//     parent's fp +	bl	prepare_ftrace_return	// prepare_ftrace_return(&lr, pc, fp) + +	mcount_exit +ENDPROC(ftrace_graph_caller) + +/* + * void return_to_handler(void) + * + * Run ftrace_return_to_handler() before going back to parent. + * @fp is checked against the value passed by ftrace_graph_caller() + * only when CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST is enabled. + */ +ENTRY(return_to_handler) +	str	x0, [sp, #-16]! +	mov	x0, x29			//     parent's fp +	bl	ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); +	mov	x30, x0			// restore the original return address +	ldr	x0, [sp], #16 +	ret +END(return_to_handler) +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3881fd115eb..9ce04ba6bcb 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -60,6 +60,9 @@  	push	x0, x1  	.if	\el == 0  	mrs	x21, sp_el0 +	get_thread_info tsk			// Ensure MDSCR_EL1.SS is clear, +	ldr	x19, [tsk, #TI_FLAGS]		// since we can unmask debug +	disable_step_tsk x19, x20		// exceptions when scheduling.  	
.else  	add	x21, sp, #S_FRAME_SIZE  	.endif @@ -259,7 +262,7 @@ el1_da:  	 * Data abort handling  	 */  	mrs	x0, far_el1 -	enable_dbg_if_not_stepping x2 +	enable_dbg  	// re-enable interrupts if they were enabled in the aborted context  	tbnz	x23, #7, 1f			// PSR_I_BIT  	enable_irq @@ -275,27 +278,31 @@ el1_sp_pc:  	 * Stack or PC alignment exception handling  	 */  	mrs	x0, far_el1 -	mov	x1, x25 +	enable_dbg  	mov	x2, sp  	b	do_sp_pc_abort  el1_undef:  	/*  	 * Undefined instruction  	 */ +	enable_dbg  	mov	x0, sp  	b	do_undefinstr  el1_dbg:  	/*  	 * Debug exception handling  	 */ +	cmp	x24, #ESR_EL1_EC_BRK64		// if BRK64 +	cinc	x24, x24, eq			// set bit '0'  	tbz	x24, #0, el1_inv		// EL1 only  	mrs	x0, far_el1  	mov	x2, sp				// struct pt_regs  	bl	do_debug_exception - +	enable_dbg  	kernel_exit 1  el1_inv:  	// TODO: add support for undefined instructions in kernel mode +	enable_dbg  	mov	x0, sp  	mov	x1, #BAD_SYNC  	mrs	x2, esr_el1 @@ -305,20 +312,17 @@ ENDPROC(el1_sync)  	.align	6  el1_irq:  	kernel_entry 1 -	enable_dbg_if_not_stepping x0 +	enable_dbg  #ifdef CONFIG_TRACE_IRQFLAGS  	bl	trace_hardirqs_off  #endif -#ifdef CONFIG_PREEMPT -	get_thread_info tsk -	ldr	x24, [tsk, #TI_PREEMPT]		// get preempt count -	add	x0, x24, #1			// increment it -	str	x0, [tsk, #TI_PREEMPT] -#endif +  	irq_handler +  #ifdef CONFIG_PREEMPT -	str	x24, [tsk, #TI_PREEMPT]		// restore preempt count -	cbnz	x24, 1f				// preempt count != 0 +	get_thread_info tsk +	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count +	cbnz	w24, 1f				// preempt count != 0  	ldr	x0, [tsk, #TI_FLAGS]		// get flags  	tbz	x0, #TIF_NEED_RESCHED, 1f	// needs rescheduling?  	bl	el1_preempt @@ -333,8 +337,7 @@ ENDPROC(el1_irq)  #ifdef CONFIG_PREEMPT  el1_preempt:  	mov	x24, lr -1:	enable_dbg -	bl	preempt_schedule_irq		// irq en/disable is done inside +1:	bl	preempt_schedule_irq		// irq en/disable is done inside  	ldr	x0, [tsk, #TI_FLAGS]		// get new tasks TI_FLAGS  	tbnz	x0, #TIF_NEED_RESCHED, 1b	// needs rescheduling?  	
ret	x24 @@ -350,7 +353,7 @@ el0_sync:  	lsr	x24, x25, #ESR_EL1_EC_SHIFT	// exception class  	cmp	x24, #ESR_EL1_EC_SVC64		// SVC in 64-bit state  	b.eq	el0_svc -	adr	lr, ret_from_exception +	adr	lr, ret_to_user  	cmp	x24, #ESR_EL1_EC_DABT_EL0	// data abort in EL0  	b.eq	el0_da  	cmp	x24, #ESR_EL1_EC_IABT_EL0	// instruction abort in EL0 @@ -379,7 +382,7 @@ el0_sync_compat:  	lsr	x24, x25, #ESR_EL1_EC_SHIFT	// exception class  	cmp	x24, #ESR_EL1_EC_SVC32		// SVC in 32-bit state  	b.eq	el0_svc_compat -	adr	lr, ret_from_exception +	adr	lr, ret_to_user  	cmp	x24, #ESR_EL1_EC_DABT_EL0	// data abort in EL0  	b.eq	el0_da  	cmp	x24, #ESR_EL1_EC_IABT_EL0	// instruction abort in EL0 @@ -424,11 +427,8 @@ el0_da:  	 */  	mrs	x0, far_el1  	bic	x0, x0, #(0xff << 56) -	disable_step x1 -	isb -	enable_dbg  	// enable interrupts before calling the main handler -	enable_irq +	enable_dbg_and_irq  	mov	x1, x25  	mov	x2, sp  	b	do_mem_abort @@ -437,11 +437,8 @@ el0_ia:  	 * Instruction abort handling  	 */  	mrs	x0, far_el1 -	disable_step x1 -	isb -	enable_dbg  	// enable interrupts before calling the main handler -	enable_irq +	enable_dbg_and_irq  	orr	x1, x25, #1 << 24		// use reserved ISS bit for instruction aborts  	mov	x2, sp  	b	do_mem_abort @@ -449,6 +446,7 @@ el0_fpsimd_acc:  	/*  	 * Floating Point or Advanced SIMD access  	 */ +	enable_dbg  	mov	x0, x25  	mov	x1, sp  	b	do_fpsimd_acc @@ -456,6 +454,7 @@ el0_fpsimd_exc:  	/*  	 * Floating Point or Advanced SIMD exception  	 */ +	enable_dbg  	mov	x0, x25  	mov	x1, sp  	b	do_fpsimd_exc @@ -464,11 +463,8 @@ el0_sp_pc:  	 * Stack or PC alignment exception handling  	 */  	mrs	x0, far_el1 -	disable_step x1 -	isb -	enable_dbg  	// enable interrupts before calling the main handler -	enable_irq +	enable_dbg_and_irq  	mov	x1, x25  	mov	x2, sp  	b	do_sp_pc_abort @@ -476,9 +472,9 @@ el0_undef:  	/*  	 * Undefined instruction  	 */ -	mov	x0, sp  	// enable interrupts before calling the main handler -	enable_irq +	enable_dbg_and_irq +	mov	x0, sp  	b	do_undefinstr  el0_dbg:  	/* @@ -486,11 +482,13 @@ el0_dbg:  	 */  	tbnz	x24, #0, el0_inv		// EL0 only  	mrs	x0, far_el1 -	disable_step x1  	mov	x1, x25  	mov	x2, sp -	b	do_debug_exception +	bl	do_debug_exception +	enable_dbg +	b	ret_to_user  el0_inv: +	enable_dbg  	mov	x0, sp  	mov	x1, #BAD_SYNC  	mrs	x2, esr_el1 @@ -501,28 +499,13 @@ ENDPROC(el0_sync)  el0_irq:  	kernel_entry 0  el0_irq_naked: -	disable_step x1 -	isb  	enable_dbg  #ifdef CONFIG_TRACE_IRQFLAGS  	bl	trace_hardirqs_off  #endif -	get_thread_info tsk -#ifdef CONFIG_PREEMPT -	ldr	x24, [tsk, #TI_PREEMPT]		// get preempt count -	add	x23, x24, #1			// increment it -	str	x23, [tsk, #TI_PREEMPT] -#endif +  	irq_handler -#ifdef CONFIG_PREEMPT -	ldr	x0, [tsk, #TI_PREEMPT] -	str	x24, [tsk, #TI_PREEMPT] -	cmp	x0, x23 -	b.eq	1f -	mov	x1, #0 -	str	x1, [x1]			// BUG -1: -#endif +  #ifdef CONFIG_TRACE_IRQFLAGS  	bl	trace_hardirqs_on  #endif @@ -530,14 +513,6 @@ el0_irq_naked:  ENDPROC(el0_irq)  /* - * This is the return code to user mode for abort handlers - */ -ret_from_exception: -	get_thread_info tsk -	b	ret_to_user -ENDPROC(ret_from_exception) - -/*   * Register switch for AArch64. The callee-saved registers need to be saved   * and restored. 
On entry:   *   x0 = previous task_struct (must be preserved across the switch) @@ -576,10 +551,7 @@ ret_fast_syscall:  	ldr	x1, [tsk, #TI_FLAGS]  	and	x2, x1, #_TIF_WORK_MASK  	cbnz	x2, fast_work_pending -	tbz	x1, #TIF_SINGLESTEP, fast_exit -	disable_dbg -	enable_step x2 -fast_exit: +	enable_step_tsk x1, x2  	kernel_exit 0, ret = 1  /* @@ -589,7 +561,7 @@ fast_work_pending:  	str	x0, [sp, #S_X0]			// returned x0  work_pending:  	tbnz	x1, #TIF_NEED_RESCHED, work_resched -	/* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ +	/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */  	ldr	x2, [sp, #S_PSTATE]  	mov	x0, sp				// 'regs'  	tst	x2, #PSR_MODE_MASK		// user mode regs? @@ -598,7 +570,6 @@ work_pending:  	bl	do_notify_resume  	b	ret_to_user  work_resched: -	enable_dbg  	bl	schedule  /* @@ -609,9 +580,7 @@ ret_to_user:  	ldr	x1, [tsk, #TI_FLAGS]  	and	x2, x1, #_TIF_WORK_MASK  	cbnz	x2, work_pending -	tbz	x1, #TIF_SINGLESTEP, no_work_pending -	disable_dbg -	enable_step x2 +	enable_step_tsk x1, x2  no_work_pending:  	kernel_exit 0, ret = 0  ENDPROC(ret_to_user) @@ -638,14 +607,11 @@ el0_svc:  	mov	sc_nr, #__NR_syscalls  el0_svc_naked:					// compat entry point  	stp	x0, scno, [sp, #S_ORIG_X0]	// save the original x0 and syscall number -	disable_step x16 -	isb -	enable_dbg -	enable_irq +	enable_dbg_and_irq -	get_thread_info tsk -	ldr	x16, [tsk, #TI_FLAGS]		// check for syscall tracing -	tbnz	x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? +	ldr	x16, [tsk, #TI_FLAGS]		// check for syscall hooks +	tst	x16, #_TIF_SYSCALL_WORK +	b.ne	__sys_trace  	adr	lr, ret_fast_syscall		// return address  	cmp     scno, sc_nr                     // check upper syscall limit  	b.hs	ni_sys @@ -661,9 +627,8 @@ ENDPROC(el0_svc)  	 * switches, and waiting for our parent to respond.  	 */  __sys_trace: -	mov	x1, sp -	mov	w0, #0				// trace entry -	bl	syscall_trace +	mov	x0, sp +	bl	syscall_trace_enter  	adr	lr, __sys_trace_return		// return address  	uxtw	scno, w0			// syscall number (possibly new)  	mov	x1, sp				// pointer to regs @@ -678,9 +643,8 @@ __sys_trace:  __sys_trace_return:  	str	x0, [sp]			// save returned x0 -	mov	x1, sp -	mov	w0, #1				// trace exit -	bl	syscall_trace +	mov	x0, sp +	bl	syscall_trace_exit  	b	ret_to_user  /* diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 1f2e4d5a5c0..ad8aebb1cde 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -17,6 +17,7 @@   * along with this program.  If not, see <http://www.gnu.org/licenses/>.   */ +#include <linux/cpu_pm.h>  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/sched.h> @@ -34,6 +35,60 @@  #define FPEXC_IDF	(1 << 7)  /* + * In order to reduce the number of times the FPSIMD state is needlessly saved + * and restored, we need to keep track of two things: + * (a) for each task, we need to remember which CPU was the last one to have + *     the task's FPSIMD state loaded into its FPSIMD registers; + * (b) for each CPU, we need to remember which task's userland FPSIMD state has + *     been loaded into its FPSIMD registers most recently, or whether it has + *     been used to perform kernel mode NEON in the meantime. + * + * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to + * the id of the current CPU everytime the state is loaded onto a CPU. 
For (b), + * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the + * address of the userland FPSIMD state of the task that was loaded onto the CPU + * the most recently, or NULL if kernel mode NEON has been performed after that. + * + * With this in place, we no longer have to restore the next FPSIMD state right + * when switching between tasks. Instead, we can defer this check to userland + * resume, at which time we verify whether the CPU's fpsimd_last_state and the + * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we + * can omit the FPSIMD restore. + * + * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to + * indicate whether or not the userland FPSIMD state of the current task is + * present in the registers. The flag is set unless the FPSIMD registers of this + * CPU currently contain the most recent userland FPSIMD state of the current + * task. + * + * For a certain task, the sequence may look something like this: + * - the task gets scheduled in; if both the task's fpsimd_state.cpu field + *   contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu + *   variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is + *   cleared, otherwise it is set; + * + * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's + *   userland FPSIMD state is copied from memory to the registers, the task's + *   fpsimd_state.cpu field is set to the id of the current CPU, the current + *   CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the + *   TIF_FOREIGN_FPSTATE flag is cleared; + * + * - the task executes an ordinary syscall; upon return to userland, the + *   TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is + *   restored; + * + * - the task executes a syscall which executes some NEON instructions; this is + *   preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD + *   register contents to memory, clears the fpsimd_last_state per-cpu variable + *   and sets the TIF_FOREIGN_FPSTATE flag; + * + * - the task gets preempted after kernel_neon_end() is called; as we have not + *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so + *   whatever is in the FPSIMD registers is not saved to memory, but discarded. + */ +static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); + +/*   * Trapped FP/ASIMD access.   */  void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) @@ -71,46 +126,175 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)  void fpsimd_thread_switch(struct task_struct *next)  { -	/* check if not kernel threads */ -	if (current->mm) +	/* +	 * Save the current FPSIMD state to memory, but only if whatever is in +	 * the registers is in fact the most recent userland FPSIMD state of +	 * 'current'. +	 */ +	if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))  		fpsimd_save_state(¤t->thread.fpsimd_state); -	if (next->mm) -		fpsimd_load_state(&next->thread.fpsimd_state); + +	if (next->mm) { +		/* +		 * If we are switching to a task whose most recent userland +		 * FPSIMD state is already in the registers of *this* cpu, +		 * we can skip loading the state from memory. Otherwise, set +		 * the TIF_FOREIGN_FPSTATE flag so the state will be loaded +		 * upon the next return to userland. 
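
Aside: with the deferred-restore scheme described above, any code that touches current's FPSIMD state in memory has to go through the new helpers so the register and memory copies stay coherent. A hedged sketch of the expected pattern follows; the *_sketch function names are illustrative, while the real users in this series are the signal and ptrace paths.

	#include <linux/sched.h>
	#include <asm/fpsimd.h>

	/* Reading current's FPSIMD state, e.g. to build a signal frame. */
	static void copy_fpsimd_out_sketch(struct fpsimd_state *dst)
	{
		fpsimd_preserve_current_state();	/* flush live registers to memory */
		*dst = current->thread.fpsimd_state;	/* memory copy is now up to date */
	}

	/* Installing a new state, e.g. on sigreturn. */
	static void copy_fpsimd_in_sketch(const struct fpsimd_state *src)
	{
		current->thread.fpsimd_state = *src;
		fpsimd_update_current_state(&current->thread.fpsimd_state);
	}

Task switch itself no longer loads anything; it only flips TIF_FOREIGN_FPSTATE, and the restore cost is paid once on the next return to user space.
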
+		 */ +		struct fpsimd_state *st = &next->thread.fpsimd_state; + +		if (__this_cpu_read(fpsimd_last_state) == st +		    && st->cpu == smp_processor_id()) +			clear_ti_thread_flag(task_thread_info(next), +					     TIF_FOREIGN_FPSTATE); +		else +			set_ti_thread_flag(task_thread_info(next), +					   TIF_FOREIGN_FPSTATE); +	}  }  void fpsimd_flush_thread(void)  {  	memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); -	fpsimd_load_state(¤t->thread.fpsimd_state); +	set_thread_flag(TIF_FOREIGN_FPSTATE);  } -#ifdef CONFIG_KERNEL_MODE_NEON +/* + * Save the userland FPSIMD state of 'current' to memory, but only if the state + * currently held in the registers does in fact belong to 'current' + */ +void fpsimd_preserve_current_state(void) +{ +	preempt_disable(); +	if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) +		fpsimd_save_state(¤t->thread.fpsimd_state); +	preempt_enable(); +}  /* - * Kernel-side NEON support functions + * Load the userland FPSIMD state of 'current' from memory, but only if the + * FPSIMD state already held in the registers is /not/ the most recent FPSIMD + * state of 'current'   */ -void kernel_neon_begin(void) +void fpsimd_restore_current_state(void)  { -	/* Avoid using the NEON in interrupt context */ -	BUG_ON(in_interrupt());  	preempt_disable(); +	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { +		struct fpsimd_state *st = ¤t->thread.fpsimd_state; -	if (current->mm) -		fpsimd_save_state(¤t->thread.fpsimd_state); +		fpsimd_load_state(st); +		this_cpu_write(fpsimd_last_state, st); +		st->cpu = smp_processor_id(); +	} +	preempt_enable();  } -EXPORT_SYMBOL(kernel_neon_begin); -void kernel_neon_end(void) +/* + * Load an updated userland FPSIMD state for 'current' from memory and set the + * flag that indicates that the FPSIMD register contents are the most recent + * FPSIMD state of 'current' + */ +void fpsimd_update_current_state(struct fpsimd_state *state)  { -	if (current->mm) -		fpsimd_load_state(¤t->thread.fpsimd_state); +	preempt_disable(); +	fpsimd_load_state(state); +	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { +		struct fpsimd_state *st = ¤t->thread.fpsimd_state; +		this_cpu_write(fpsimd_last_state, st); +		st->cpu = smp_processor_id(); +	}  	preempt_enable();  } + +/* + * Invalidate live CPU copies of task t's FPSIMD state + */ +void fpsimd_flush_task_state(struct task_struct *t) +{ +	t->thread.fpsimd_state.cpu = NR_CPUS; +} + +#ifdef CONFIG_KERNEL_MODE_NEON + +static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); +static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); + +/* + * Kernel-side NEON support functions + */ +void kernel_neon_begin_partial(u32 num_regs) +{ +	if (in_interrupt()) { +		struct fpsimd_partial_state *s = this_cpu_ptr( +			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); + +		BUG_ON(num_regs > 32); +		fpsimd_save_partial_state(s, roundup(num_regs, 2)); +	} else { +		/* +		 * Save the userland FPSIMD state if we have one and if we +		 * haven't done so already. Clear fpsimd_last_state to indicate +		 * that there is no longer userland FPSIMD state in the +		 * registers. +		 */ +		preempt_disable(); +		if (current->mm && +		    !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) +			fpsimd_save_state(¤t->thread.fpsimd_state); +		this_cpu_write(fpsimd_last_state, NULL); +	} +} +EXPORT_SYMBOL(kernel_neon_begin_partial); + +void kernel_neon_end(void) +{ +	if (in_interrupt()) { +		struct fpsimd_partial_state *s = this_cpu_ptr( +			in_irq() ? 
&hardirq_fpsimdstate : &softirq_fpsimdstate); +		fpsimd_load_partial_state(s); +	} else { +		preempt_enable(); +	} +}  EXPORT_SYMBOL(kernel_neon_end);  #endif /* CONFIG_KERNEL_MODE_NEON */ +#ifdef CONFIG_CPU_PM +static int fpsimd_cpu_pm_notifier(struct notifier_block *self, +				  unsigned long cmd, void *v) +{ +	switch (cmd) { +	case CPU_PM_ENTER: +		if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) +			fpsimd_save_state(¤t->thread.fpsimd_state); +		break; +	case CPU_PM_EXIT: +		if (current->mm) +			set_thread_flag(TIF_FOREIGN_FPSTATE); +		break; +	case CPU_PM_ENTER_FAILED: +	default: +		return NOTIFY_DONE; +	} +	return NOTIFY_OK; +} + +static struct notifier_block fpsimd_cpu_pm_notifier_block = { +	.notifier_call = fpsimd_cpu_pm_notifier, +}; + +static void fpsimd_pm_init(void) +{ +	cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block); +} + +#else +static inline void fpsimd_pm_init(void) { } +#endif /* CONFIG_CPU_PM */ +  /*   * FP/SIMD support code initialisation.   */ @@ -129,6 +313,8 @@ static int __init fpsimd_init(void)  	else  		elf_hwcap |= HWCAP_ASIMD; +	fpsimd_pm_init(); +  	return 0;  }  late_initcall(fpsimd_init); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c new file mode 100644 index 00000000000..7924d73b647 --- /dev/null +++ b/arch/arm64/kernel/ftrace.c @@ -0,0 +1,176 @@ +/* + * arch/arm64/kernel/ftrace.c + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ftrace.h> +#include <linux/swab.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/ftrace.h> +#include <asm/insn.h> + +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * Replace a single instruction, which may be a branch or NOP. + * If @validate == true, a replaced instruction is checked against 'old'. + */ +static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, +			      bool validate) +{ +	u32 replaced; + +	/* +	 * Note: +	 * Due to modules and __init, code can disappear and change, +	 * we need to protect against faulting as well as code changing. +	 * We do this by aarch64_insn_*() which use the probe_kernel_*(). +	 * +	 * No lock is held here because all the modifications are run +	 * through stop_machine(). 
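+	 *
+	 * The callers below follow a read-validate-write pattern; for
+	 * example (illustrative), enabling tracing of a call site expects
+	 * to find the NOP planted there earlier:
+	 *
+	 *	old = aarch64_insn_gen_nop();
+	 *	new = aarch64_insn_gen_branch_imm(pc, addr, true);
+	 *	ret = ftrace_modify_code(pc, old, new, true);
+	 *
+	 * If the site no longer contains 'old', we return -EINVAL instead
+	 * of silently overwriting whatever is there now.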
+	 */ +	if (validate) { +		if (aarch64_insn_read((void *)pc, &replaced)) +			return -EFAULT; + +		if (replaced != old) +			return -EINVAL; +	} +	if (aarch64_insn_patch_text_nosync((void *)pc, new)) +		return -EPERM; + +	return 0; +} + +/* + * Replace tracer function in ftrace_caller() + */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ +	unsigned long pc; +	u32 new; + +	pc = (unsigned long)&ftrace_call; +	new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, true); + +	return ftrace_modify_code(pc, 0, new, false); +} + +/* + * Turn on the call to ftrace_caller() in instrumented function + */ +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned long pc = rec->ip; +	u32 old, new; + +	old = aarch64_insn_gen_nop(); +	new = aarch64_insn_gen_branch_imm(pc, addr, true); + +	return ftrace_modify_code(pc, old, new, true); +} + +/* + * Turn off the call to ftrace_caller() in instrumented function + */ +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, +		    unsigned long addr) +{ +	unsigned long pc = rec->ip; +	u32 old, new; + +	old = aarch64_insn_gen_branch_imm(pc, addr, true); +	new = aarch64_insn_gen_nop(); + +	return ftrace_modify_code(pc, old, new, true); +} + +int __init ftrace_dyn_arch_init(void) +{ +	return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * function_graph tracer expects ftrace_return_to_handler() to be called + * on the way back to parent. For this purpose, this function is called + * in _mcount() or ftrace_caller() to replace return address (*parent) on + * the call stack to return_to_handler. + * + * Note that @frame_pointer is used only for sanity check later. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +			   unsigned long frame_pointer) +{ +	unsigned long return_hooker = (unsigned long)&return_to_handler; +	unsigned long old; +	struct ftrace_graph_ent trace; +	int err; + +	if (unlikely(atomic_read(¤t->tracing_graph_pause))) +		return; + +	/* +	 * Note: +	 * No protection against faulting at *parent, which may be seen +	 * on other archs. It's unlikely on AArch64. +	 */ +	old = *parent; +	*parent = return_hooker; + +	trace.func = self_addr; +	trace.depth = current->curr_ret_stack + 1; + +	/* Only trace if the calling function expects to */ +	if (!ftrace_graph_entry(&trace)) { +		*parent = old; +		return; +	} + +	err = ftrace_push_return_trace(old, self_addr, &trace.depth, +				       frame_pointer); +	if (err == -EBUSY) { +		*parent = old; +		return; +	} +} + +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() + * depending on @enable. 
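+ *
+ * This is the same NOP <-> branch toggle as above, applied to the
+ * ftrace_graph_call site inside ftrace_caller(): enabling patches in
+ * roughly "b ftrace_graph_caller", disabling restores the NOP, and
+ * both directions validate the instruction they expect to replace.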
+ */ +static int ftrace_modify_graph_caller(bool enable) +{ +	unsigned long pc = (unsigned long)&ftrace_graph_call; +	u32 branch, nop; + +	branch = aarch64_insn_gen_branch_imm(pc, +			(unsigned long)ftrace_graph_caller, false); +	nop = aarch64_insn_gen_nop(); + +	if (enable) +		return ftrace_modify_code(pc, nop, branch, true); +	else +		return ftrace_modify_code(pc, branch, nop, true); +} + +int ftrace_enable_ftrace_graph_caller(void) +{ +	return ftrace_modify_graph_caller(true); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ +	return ftrace_modify_graph_caller(false); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7090c126797..a2c1195abb7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -26,6 +26,7 @@  #include <asm/assembler.h>  #include <asm/ptrace.h>  #include <asm/asm-offsets.h> +#include <asm/cache.h>  #include <asm/cputype.h>  #include <asm/memory.h>  #include <asm/thread_info.h> @@ -107,8 +108,18 @@  	/*  	 * DO NOT MODIFY. Image header expected by Linux boot-loaders.  	 */ +#ifdef CONFIG_EFI +efi_head: +	/* +	 * This add instruction has no meaningful effect except that +	 * its opcode forms the magic "MZ" signature required by UEFI. +	 */ +	add	x13, x18, #0x16 +	b	stext +#else  	b	stext				// branch to kernel start, magic  	.long	0				// reserved +#endif  	.quad	TEXT_OFFSET			// Image load offset from start of RAM  	.quad	0				// reserved  	.quad	0				// reserved @@ -119,12 +130,115 @@  	.byte	0x52  	.byte	0x4d  	.byte	0x64 +#ifdef CONFIG_EFI +	.long	pe_header - efi_head		// Offset to the PE header. +#else  	.word	0				// reserved +#endif + +#ifdef CONFIG_EFI +	.align 3 +pe_header: +	.ascii	"PE" +	.short 	0 +coff_header: +	.short	0xaa64				// AArch64 +	.short	2				// nr_sections +	.long	0 				// TimeDateStamp +	.long	0				// PointerToSymbolTable +	.long	1				// NumberOfSymbols +	.short	section_table - optional_header	// SizeOfOptionalHeader +	.short	0x206				// Characteristics. 
+						// IMAGE_FILE_DEBUG_STRIPPED | +						// IMAGE_FILE_EXECUTABLE_IMAGE | +						// IMAGE_FILE_LINE_NUMS_STRIPPED +optional_header: +	.short	0x20b				// PE32+ format +	.byte	0x02				// MajorLinkerVersion +	.byte	0x14				// MinorLinkerVersion +	.long	_edata - stext			// SizeOfCode +	.long	0				// SizeOfInitializedData +	.long	0				// SizeOfUninitializedData +	.long	efi_stub_entry - efi_head	// AddressOfEntryPoint +	.long	stext - efi_head		// BaseOfCode + +extra_header_fields: +	.quad	0				// ImageBase +	.long	0x20				// SectionAlignment +	.long	0x8				// FileAlignment +	.short	0				// MajorOperatingSystemVersion +	.short	0				// MinorOperatingSystemVersion +	.short	0				// MajorImageVersion +	.short	0				// MinorImageVersion +	.short	0				// MajorSubsystemVersion +	.short	0				// MinorSubsystemVersion +	.long	0				// Win32VersionValue + +	.long	_edata - efi_head		// SizeOfImage + +	// Everything before the kernel image is considered part of the header +	.long	stext - efi_head		// SizeOfHeaders +	.long	0				// CheckSum +	.short	0xa				// Subsystem (EFI application) +	.short	0				// DllCharacteristics +	.quad	0				// SizeOfStackReserve +	.quad	0				// SizeOfStackCommit +	.quad	0				// SizeOfHeapReserve +	.quad	0				// SizeOfHeapCommit +	.long	0				// LoaderFlags +	.long	0x6				// NumberOfRvaAndSizes + +	.quad	0				// ExportTable +	.quad	0				// ImportTable +	.quad	0				// ResourceTable +	.quad	0				// ExceptionTable +	.quad	0				// CertificationTable +	.quad	0				// BaseRelocationTable + +	// Section table +section_table: + +	/* +	 * The EFI application loader requires a relocation section +	 * because EFI applications must be relocatable.  This is a +	 * dummy section as far as we are concerned. +	 */ +	.ascii	".reloc" +	.byte	0 +	.byte	0			// end of 0 padding of section name +	.long	0 +	.long	0 +	.long	0			// SizeOfRawData +	.long	0			// PointerToRawData +	.long	0			// PointerToRelocations +	.long	0			// PointerToLineNumbers +	.short	0			// NumberOfRelocations +	.short	0			// NumberOfLineNumbers +	.long	0x42100040		// Characteristics (section flags) + + +	.ascii	".text" +	.byte	0 +	.byte	0 +	.byte	0        		// end of 0 padding of section name +	.long	_edata - stext		// VirtualSize +	.long	stext - efi_head	// VirtualAddress +	.long	_edata - stext		// SizeOfRawData +	.long	stext - efi_head	// PointerToRawData + +	.long	0		// PointerToRelocations (0 for executables) +	.long	0		// PointerToLineNumbers (0 for executables) +	.short	0		// NumberOfRelocations  (0 for executables) +	.short	0		// NumberOfLineNumbers  (0 for executables) +	.long	0xe0500020	// Characteristics (section flags) +	.align 5 +#endif  ENTRY(stext)  	mov	x21, x0				// x21=FDT +	bl	el2_setup			// Drop to EL1, w20=cpu_boot_mode  	bl	__calc_phys_offset		// x24=PHYS_OFFSET, x28=PHYS_OFFSET-PAGE_OFFSET -	bl	el2_setup			// Drop to EL1 +	bl	set_cpu_boot_mode_flag  	mrs	x22, midr_el1			// x22=cpuid  	mov	x0, x22  	bl	lookup_processor_type @@ -150,21 +264,29 @@ ENDPROC(stext)  /*   * If we're fortunate enough to boot at EL2, ensure that the world is   * sane before dropping to EL1. + * + * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in x20 if + * booted in EL1 or EL2 respectively.   */  ENTRY(el2_setup)  	mrs	x0, CurrentEL -	cmp	x0, #PSR_MODE_EL2t -	ccmp	x0, #PSR_MODE_EL2h, #0x4, ne -	ldr	x0, =__boot_cpu_mode		// Compute __boot_cpu_mode -	add	x0, x0, x28 -	b.eq	1f -	str	wzr, [x0]			// Remember we don't have EL2... 
+	cmp	x0, #CurrentEL_EL2 +	b.ne	1f +	mrs	x0, sctlr_el2 +CPU_BE(	orr	x0, x0, #(1 << 25)	)	// Set the EE bit for EL2 +CPU_LE(	bic	x0, x0, #(1 << 25)	)	// Clear the EE bit for EL2 +	msr	sctlr_el2, x0 +	b	2f +1:	mrs	x0, sctlr_el1 +CPU_BE(	orr	x0, x0, #(3 << 24)	)	// Set the EE and E0E bits for EL1 +CPU_LE(	bic	x0, x0, #(3 << 24)	)	// Clear the EE and E0E bits for EL1 +	msr	sctlr_el1, x0 +	mov	w20, #BOOT_CPU_MODE_EL1		// This cpu booted in EL1 +	isb  	ret  	/* Hyp configuration. */ -1:	ldr	w1, =BOOT_CPU_MODE_EL2 -	str	w1, [x0, #4]			// This CPU has EL2 -	mov	x0, #(1 << 31)			// 64-bit EL1 +2:	mov	x0, #(1 << 31)			// 64-bit EL1  	msr	hcr_el2, x0  	/* Generic timers. */ @@ -181,7 +303,8 @@ ENTRY(el2_setup)  	/* sctlr_el1 */  	mov	x0, #0x0800			// Set/clear RES{1,0} bits -	movk	x0, #0x30d0, lsl #16 +CPU_BE(	movk	x0, #0x33d0, lsl #16	)	// Set EE and E0E on BE systems +CPU_LE(	movk	x0, #0x30d0, lsl #16	)	// Clear EE and E0E on LE systems  	msr	sctlr_el1, x0  	/* Coprocessor traps. */ @@ -204,18 +327,36 @@ ENTRY(el2_setup)  		      PSR_MODE_EL1h)  	msr	spsr_el2, x0  	msr	elr_el2, lr +	mov	w20, #BOOT_CPU_MODE_EL2		// This CPU booted in EL2  	eret  ENDPROC(el2_setup)  /* + * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed + * in x20. See arch/arm64/include/asm/virt.h for more info. + */ +ENTRY(set_cpu_boot_mode_flag) +	ldr	x1, =__boot_cpu_mode		// Compute __boot_cpu_mode +	add	x1, x1, x28 +	cmp	w20, #BOOT_CPU_MODE_EL2 +	b.ne	1f +	add	x1, x1, #4 +1:	str	w20, [x1]			// This CPU has booted in EL1 +	dmb	sy +	dc	ivac, x1			// Invalidate potentially stale cache line +	ret +ENDPROC(set_cpu_boot_mode_flag) + +/*   * We need to find out the CPU boot mode long after boot, so we need to   * store it in a writable variable.   *   * This is not in .bss, because we set it sufficiently early that the boot-time   * zeroing of .bss would clobber it.   */ -	.pushsection	.data +	.pushsection	.data..cacheline_aligned  ENTRY(__boot_cpu_mode) +	.align	L1_CACHE_SHIFT  	.long	BOOT_CPU_MODE_EL2  	.long	0  	.popsection @@ -225,7 +366,6 @@ ENTRY(__boot_cpu_mode)  	.quad	PAGE_OFFSET  #ifdef CONFIG_SMP -	.pushsection    .smp.pen.text, "ax"  	.align	3  1:	.quad	.  	.quad	secondary_holding_pen_release @@ -235,8 +375,9 @@ ENTRY(__boot_cpu_mode)  	 * cores are held until we're ready for them to initialise.  	 */  ENTRY(secondary_holding_pen) -	bl	__calc_phys_offset		// x24=phys offset -	bl	el2_setup			// Drop to EL1 +	bl	el2_setup			// Drop to EL1, w20=cpu_boot_mode +	bl	__calc_phys_offset		// x24=PHYS_OFFSET, x28=PHYS_OFFSET-PAGE_OFFSET +	bl	set_cpu_boot_mode_flag  	mrs	x0, mpidr_el1  	ldr     x1, =MPIDR_HWID_BITMASK  	and	x0, x0, x1 @@ -250,7 +391,17 @@ pen:	ldr	x4, [x3]  	wfe  	b	pen  ENDPROC(secondary_holding_pen) -	.popsection + +	/* +	 * Secondary entry point that jumps straight into the kernel. Only to +	 * be used where CPUs are brought online dynamically by the kernel. 
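+	 *
+	 * For example (not a requirement of this code), a cpu_ops enable
+	 * method such as PSCI would release a CPU into this path with
+	 * something like:
+	 *
+	 *	psci_ops.cpu_on(cpu_logical_map(cpu), __pa(secondary_entry));
+	 *
+	 * so execution starts here with the MMU off, in EL1 or EL2.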
+	 */ +ENTRY(secondary_entry) +	bl	el2_setup			// Drop to EL1 +	bl	__calc_phys_offset		// x24=PHYS_OFFSET, x28=PHYS_OFFSET-PAGE_OFFSET +	bl	set_cpu_boot_mode_flag +	b	secondary_startup +ENDPROC(secondary_entry)  ENTRY(secondary_startup)  	/* @@ -348,26 +499,18 @@ ENDPROC(__calc_phys_offset)   * Preserves:	tbl, flags   * Corrupts:	phys, start, end, pstate   */ -	.macro	create_block_map, tbl, flags, phys, start, end, idmap=0 +	.macro	create_block_map, tbl, flags, phys, start, end  	lsr	\phys, \phys, #BLOCK_SHIFT -	.if	\idmap -	and	\start, \phys, #PTRS_PER_PTE - 1	// table index -	.else  	lsr	\start, \start, #BLOCK_SHIFT  	and	\start, \start, #PTRS_PER_PTE - 1	// table index -	.endif  	orr	\phys, \flags, \phys, lsl #BLOCK_SHIFT	// table entry -	.ifnc	\start,\end  	lsr	\end, \end, #BLOCK_SHIFT  	and	\end, \end, #PTRS_PER_PTE - 1		// table end index -	.endif  9999:	str	\phys, [\tbl, \start, lsl #3]		// store the entry -	.ifnc	\start,\end  	add	\start, \start, #1			// next entry  	add	\phys, \phys, #BLOCK_SIZE		// next block  	cmp	\start, \end  	b.ls	9999b -	.endif  	.endm  /* @@ -376,10 +519,19 @@ ENDPROC(__calc_phys_offset)   *   - identity mapping to enable the MMU (low address, TTBR0)   *   - first few MB of the kernel linear mapping to jump to once the MMU has   *     been enabled, including the FDT blob (TTBR1) - *   - UART mapping if CONFIG_EARLY_PRINTK is enabled (TTBR1) + *   - pgd entry for fixed mappings (TTBR1)   */  __create_page_tables:  	pgtbl	x25, x26, x24			// idmap_pg_dir and swapper_pg_dir addresses +	mov	x27, lr + +	/* +	 * Invalidate the idmap and swapper page tables to avoid potential +	 * dirty cache lines being evicted. +	 */ +	mov	x0, x25 +	add	x1, x26, #SWAPPER_DIR_SIZE +	bl	__inval_cache_range  	/*  	 * Clear the idmap and swapper page tables. @@ -399,9 +551,13 @@ __create_page_tables:  	 * Create the identity mapping.  	 */  	add	x0, x25, #PAGE_SIZE		// section table address -	adr	x3, __turn_mmu_on		// virtual/physical address +	ldr	x3, =KERNEL_START +	add	x3, x3, x28			// __pa(KERNEL_START)  	create_pgd_entry x25, x0, x3, x5, x6 -	create_block_map x0, x7, x3, x5, x5, idmap=1 +	ldr	x6, =KERNEL_END +	mov	x5, x3				// __pa(KERNEL_START) +	add	x6, x6, x28			// __pa(KERNEL_END) +	create_block_map x0, x7, x3, x5, x6  	/*  	 * Map the kernel image (starting with PHYS_OFFSET). @@ -409,7 +565,7 @@ __create_page_tables:  	add	x0, x26, #PAGE_SIZE		// section table address  	mov	x5, #PAGE_OFFSET  	create_pgd_entry x26, x0, x5, x3, x6 -	ldr	x6, =KERNEL_END - 1 +	ldr	x6, =KERNEL_END  	mov	x3, x24				// phys offset  	create_block_map x0, x7, x3, x5, x6 @@ -429,15 +585,23 @@ __create_page_tables:  	sub	x6, x6, #1			// inclusive range  	create_block_map x0, x7, x3, x5, x6  1: -#ifdef CONFIG_EARLY_PRINTK  	/* -	 * Create the pgd entry for the UART mapping. The full mapping is done -	 * later based earlyprintk kernel parameter. +	 * Create the pgd entry for the fixed mappings.  	 */ -	ldr	x5, =EARLYCON_IOBASE		// UART virtual address +	ldr	x5, =FIXADDR_TOP		// Fixed mapping virtual address  	add	x0, x26, #2 * PAGE_SIZE		// section table address  	create_pgd_entry x26, x0, x5, x6, x7 -#endif + +	/* +	 * Since the page tables have been populated with non-cacheable +	 * accesses (MMU disabled), invalidate the idmap and swapper page +	 * tables again to remove any speculatively loaded cache lines. 
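+	 *
+	 * The range is the same one invalidated before the tables were
+	 * written, roughly [idmap_pg_dir, swapper_pg_dir + SWAPPER_DIR_SIZE):
+	 * any line allocated by speculation while the tables were written
+	 * with the MMU off could otherwise shadow the new entries once
+	 * caching is enabled.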
+	 */ +	mov	x0, x25 +	add	x1, x26, #SWAPPER_DIR_SIZE +	bl	__inval_cache_range + +	mov	lr, x27  	ret  ENDPROC(__create_page_tables)  	.ltorg @@ -446,8 +610,6 @@ ENDPROC(__create_page_tables)  	.type	__switch_data, %object  __switch_data:  	.quad	__mmap_switched -	.quad	__data_loc			// x4 -	.quad	_data				// x5  	.quad	__bss_start			// x6  	.quad	_end				// x7  	.quad	processor_id			// x4 @@ -462,15 +624,7 @@ __switch_data:  __mmap_switched:  	adr	x3, __switch_data + 8 -	ldp	x4, x5, [x3], #16  	ldp	x6, x7, [x3], #16 -	cmp	x4, x5				// Copy data segment if needed -1:	ccmp	x5, x6, #4, ne -	b.eq	2f -	ldr	x16, [x4], #8 -	str	x16, [x5], #8 -	b	1b -2:  1:	cmp	x6, x7  	b.hs	2f  	str	xzr, [x6], #8			// Clear BSS diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 329218ca9ff..df1cf15377b 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -20,13 +20,14 @@  #define pr_fmt(fmt) "hw-breakpoint: " fmt +#include <linux/compat.h> +#include <linux/cpu_pm.h>  #include <linux/errno.h>  #include <linux/hw_breakpoint.h>  #include <linux/perf_event.h>  #include <linux/ptrace.h>  #include <linux/smp.h> -#include <asm/compat.h>  #include <asm/current.h>  #include <asm/debug-monitors.h>  #include <asm/hw_breakpoint.h> @@ -169,94 +170,134 @@ static enum debug_el debug_exception_level(int privilege)  	}  } -/* - * Install a perf counter breakpoint. +enum hw_breakpoint_ops { +	HW_BREAKPOINT_INSTALL, +	HW_BREAKPOINT_UNINSTALL, +	HW_BREAKPOINT_RESTORE +}; + +/** + * hw_breakpoint_slot_setup - Find and setup a perf slot according to + *			      operations + * + * @slots: pointer to array of slots + * @max_slots: max number of slots + * @bp: perf_event to setup + * @ops: operation to be carried out on the slot + * + * Return: + *	slot index on success + *	-ENOSPC if no slot is available/matches + *	-EINVAL on wrong operations parameter   */ -int arch_install_hw_breakpoint(struct perf_event *bp) +static int hw_breakpoint_slot_setup(struct perf_event **slots, int max_slots, +				    struct perf_event *bp, +				    enum hw_breakpoint_ops ops) +{ +	int i; +	struct perf_event **slot; + +	for (i = 0; i < max_slots; ++i) { +		slot = &slots[i]; +		switch (ops) { +		case HW_BREAKPOINT_INSTALL: +			if (!*slot) { +				*slot = bp; +				return i; +			} +			break; +		case HW_BREAKPOINT_UNINSTALL: +			if (*slot == bp) { +				*slot = NULL; +				return i; +			} +			break; +		case HW_BREAKPOINT_RESTORE: +			if (*slot == bp) +				return i; +			break; +		default: +			pr_warn_once("Unhandled hw breakpoint ops %d\n", ops); +			return -EINVAL; +		} +	} +	return -ENOSPC; +} + +static int hw_breakpoint_control(struct perf_event *bp, +				 enum hw_breakpoint_ops ops)  {  	struct arch_hw_breakpoint *info = counter_arch_bp(bp); -	struct perf_event **slot, **slots; +	struct perf_event **slots;  	struct debug_info *debug_info = ¤t->thread.debug;  	int i, max_slots, ctrl_reg, val_reg, reg_enable; +	enum debug_el dbg_el = debug_exception_level(info->ctrl.privilege);  	u32 ctrl;  	if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {  		/* Breakpoint */  		ctrl_reg = AARCH64_DBG_REG_BCR;  		val_reg = AARCH64_DBG_REG_BVR; -		slots = __get_cpu_var(bp_on_reg); +		slots = this_cpu_ptr(bp_on_reg);  		max_slots = core_num_brps;  		reg_enable = !debug_info->bps_disabled;  	} else {  		/* Watchpoint */  		ctrl_reg = AARCH64_DBG_REG_WCR;  		val_reg = AARCH64_DBG_REG_WVR; -		slots = __get_cpu_var(wp_on_reg); +		slots = this_cpu_ptr(wp_on_reg);  		max_slots = core_num_wrps;  		reg_enable = 
!debug_info->wps_disabled;  	} -	for (i = 0; i < max_slots; ++i) { -		slot = &slots[i]; - -		if (!*slot) { -			*slot = bp; -			break; -		} -	} - -	if (WARN_ONCE(i == max_slots, "Can't find any breakpoint slot")) -		return -ENOSPC; +	i = hw_breakpoint_slot_setup(slots, max_slots, bp, ops); -	/* Ensure debug monitors are enabled at the correct exception level.  */ -	enable_debug_monitors(debug_exception_level(info->ctrl.privilege)); +	if (WARN_ONCE(i < 0, "Can't find any breakpoint slot")) +		return i; -	/* Setup the address register. */ -	write_wb_reg(val_reg, i, info->address); +	switch (ops) { +	case HW_BREAKPOINT_INSTALL: +		/* +		 * Ensure debug monitors are enabled at the correct exception +		 * level. +		 */ +		enable_debug_monitors(dbg_el); +		/* Fall through */ +	case HW_BREAKPOINT_RESTORE: +		/* Setup the address register. */ +		write_wb_reg(val_reg, i, info->address); + +		/* Setup the control register. */ +		ctrl = encode_ctrl_reg(info->ctrl); +		write_wb_reg(ctrl_reg, i, +			     reg_enable ? ctrl | 0x1 : ctrl & ~0x1); +		break; +	case HW_BREAKPOINT_UNINSTALL: +		/* Reset the control register. */ +		write_wb_reg(ctrl_reg, i, 0); -	/* Setup the control register. */ -	ctrl = encode_ctrl_reg(info->ctrl); -	write_wb_reg(ctrl_reg, i, reg_enable ? ctrl | 0x1 : ctrl & ~0x1); +		/* +		 * Release the debug monitors for the correct exception +		 * level. +		 */ +		disable_debug_monitors(dbg_el); +		break; +	}  	return 0;  } -void arch_uninstall_hw_breakpoint(struct perf_event *bp) +/* + * Install a perf counter breakpoint. + */ +int arch_install_hw_breakpoint(struct perf_event *bp)  { -	struct arch_hw_breakpoint *info = counter_arch_bp(bp); -	struct perf_event **slot, **slots; -	int i, max_slots, base; - -	if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { -		/* Breakpoint */ -		base = AARCH64_DBG_REG_BCR; -		slots = __get_cpu_var(bp_on_reg); -		max_slots = core_num_brps; -	} else { -		/* Watchpoint */ -		base = AARCH64_DBG_REG_WCR; -		slots = __get_cpu_var(wp_on_reg); -		max_slots = core_num_wrps; -	} - -	/* Remove the breakpoint. */ -	for (i = 0; i < max_slots; ++i) { -		slot = &slots[i]; - -		if (*slot == bp) { -			*slot = NULL; -			break; -		} -	} - -	if (WARN_ONCE(i == max_slots, "Can't find any breakpoint slot")) -		return; - -	/* Reset the control register. */ -	write_wb_reg(base, i, 0); +	return hw_breakpoint_control(bp, HW_BREAKPOINT_INSTALL); +} -	/* Release the debug monitors for the correct exception level.  
*/ -	disable_debug_monitors(debug_exception_level(info->ctrl.privilege)); +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ +	hw_breakpoint_control(bp, HW_BREAKPOINT_UNINSTALL);  }  static int get_hbp_len(u8 hbp_len) @@ -505,11 +546,11 @@ static void toggle_bp_registers(int reg, enum debug_el el, int enable)  	switch (reg) {  	case AARCH64_DBG_REG_BCR: -		slots = __get_cpu_var(bp_on_reg); +		slots = this_cpu_ptr(bp_on_reg);  		max_slots = core_num_brps;  		break;  	case AARCH64_DBG_REG_WCR: -		slots = __get_cpu_var(wp_on_reg); +		slots = this_cpu_ptr(wp_on_reg);  		max_slots = core_num_wrps;  		break;  	default: @@ -546,7 +587,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,  	struct debug_info *debug_info;  	struct arch_hw_breakpoint_ctrl ctrl; -	slots = (struct perf_event **)__get_cpu_var(bp_on_reg); +	slots = this_cpu_ptr(bp_on_reg);  	addr = instruction_pointer(regs);  	debug_info = ¤t->thread.debug; @@ -596,7 +637,7 @@ unlock:  			user_enable_single_step(current);  	} else {  		toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0); -		kernel_step = &__get_cpu_var(stepping_kernel_bp); +		kernel_step = this_cpu_ptr(&stepping_kernel_bp);  		if (*kernel_step != ARM_KERNEL_STEP_NONE)  			return 0; @@ -623,7 +664,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,  	struct arch_hw_breakpoint *info;  	struct arch_hw_breakpoint_ctrl ctrl; -	slots = (struct perf_event **)__get_cpu_var(wp_on_reg); +	slots = this_cpu_ptr(wp_on_reg);  	debug_info = ¤t->thread.debug;  	for (i = 0; i < core_num_wrps; ++i) { @@ -698,7 +739,7 @@ unlock:  			user_enable_single_step(current);  	} else {  		toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0); -		kernel_step = &__get_cpu_var(stepping_kernel_bp); +		kernel_step = this_cpu_ptr(&stepping_kernel_bp);  		if (*kernel_step != ARM_KERNEL_STEP_NONE)  			return 0; @@ -722,7 +763,7 @@ int reinstall_suspended_bps(struct pt_regs *regs)  	struct debug_info *debug_info = ¤t->thread.debug;  	int handled_exception = 0, *kernel_step; -	kernel_step = &__get_cpu_var(stepping_kernel_bp); +	kernel_step = this_cpu_ptr(&stepping_kernel_bp);  	/*  	 * Called from single-step exception handler. @@ -806,18 +847,36 @@ void hw_breakpoint_thread_switch(struct task_struct *next)  /*   * CPU initialisation.   */ -static void reset_ctrl_regs(void *unused) +static void hw_breakpoint_reset(void *unused)  {  	int i; - -	for (i = 0; i < core_num_brps; ++i) { -		write_wb_reg(AARCH64_DBG_REG_BCR, i, 0UL); -		write_wb_reg(AARCH64_DBG_REG_BVR, i, 0UL); +	struct perf_event **slots; +	/* +	 * When a CPU goes through cold-boot, it does not have any installed +	 * slot, so it is safe to share the same function for restoring and +	 * resetting breakpoints; when a CPU is hotplugged in, it goes +	 * through the slots, which are all empty, hence it just resets control +	 * and value for debug registers. +	 * When this function is triggered on warm-boot through a CPU PM +	 * notifier some slots might be initialized; if so they are +	 * reprogrammed according to the debug slots content. 
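+	 *
+	 * Either way, the per-slot decision below boils down to:
+	 *
+	 *	if (slots[i])
+	 *		hw_breakpoint_control(slots[i], HW_BREAKPOINT_RESTORE);
+	 *	else
+	 *		write 0 to the slot's control and value registers;
+	 *
+	 * applied first to the breakpoint slots and then to the watchpoint
+	 * slots.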
+	 */ +	for (slots = this_cpu_ptr(bp_on_reg), i = 0; i < core_num_brps; ++i) { +		if (slots[i]) { +			hw_breakpoint_control(slots[i], HW_BREAKPOINT_RESTORE); +		} else { +			write_wb_reg(AARCH64_DBG_REG_BCR, i, 0UL); +			write_wb_reg(AARCH64_DBG_REG_BVR, i, 0UL); +		}  	} -	for (i = 0; i < core_num_wrps; ++i) { -		write_wb_reg(AARCH64_DBG_REG_WCR, i, 0UL); -		write_wb_reg(AARCH64_DBG_REG_WVR, i, 0UL); +	for (slots = this_cpu_ptr(wp_on_reg), i = 0; i < core_num_wrps; ++i) { +		if (slots[i]) { +			hw_breakpoint_control(slots[i], HW_BREAKPOINT_RESTORE); +		} else { +			write_wb_reg(AARCH64_DBG_REG_WCR, i, 0UL); +			write_wb_reg(AARCH64_DBG_REG_WVR, i, 0UL); +		}  	}  } @@ -827,7 +886,7 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self,  {  	int cpu = (long)hcpu;  	if (action == CPU_ONLINE) -		smp_call_function_single(cpu, reset_ctrl_regs, NULL, 1); +		smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1);  	return NOTIFY_OK;  } @@ -835,6 +894,14 @@ static struct notifier_block hw_breakpoint_reset_nb = {  	.notifier_call = hw_breakpoint_reset_notify,  }; +#ifdef CONFIG_ARM64_CPU_SUSPEND +extern void cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)); +#else +static inline void cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) +{ +} +#endif +  /*   * One-time initialisation.   */ @@ -846,12 +913,14 @@ static int __init arch_hw_breakpoint_init(void)  	pr_info("found %d breakpoint and %d watchpoint registers.\n",  		core_num_brps, core_num_wrps); +	cpu_notifier_register_begin(); +  	/*  	 * Reset the breakpoint resources. We assume that a halting  	 * debugger will leave the world in a nice state for us.  	 */ -	smp_call_function(reset_ctrl_regs, NULL, 1); -	reset_ctrl_regs(NULL); +	smp_call_function(hw_breakpoint_reset, NULL, 1); +	hw_breakpoint_reset(NULL);  	/* Register debug fault handlers. */  	hook_debug_fault_code(DBG_ESR_EVT_HWBP, breakpoint_handler, SIGTRAP, @@ -860,7 +929,12 @@ static int __init arch_hw_breakpoint_init(void)  			      TRAP_HWBKPT, "hw-watchpoint handler");  	/* Register hotplug notifier. */ -	register_cpu_notifier(&hw_breakpoint_reset_nb); +	__register_cpu_notifier(&hw_breakpoint_reset_nb); + +	cpu_notifier_register_done(); + +	/* Register cpu_suspend hw breakpoint restore hook */ +	cpu_suspend_set_dbg_restorer(hw_breakpoint_reset);  	return 0;  } diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c new file mode 100644 index 00000000000..92f36835486 --- /dev/null +++ b/arch/arm64/kernel/insn.c @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2013 Huawei Ltd. + * Author: Jiang Liu <liuj97@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ */ +#include <linux/bitops.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/stop_machine.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/insn.h> + +static int aarch64_insn_encoding_class[] = { +	AARCH64_INSN_CLS_UNKNOWN, +	AARCH64_INSN_CLS_UNKNOWN, +	AARCH64_INSN_CLS_UNKNOWN, +	AARCH64_INSN_CLS_UNKNOWN, +	AARCH64_INSN_CLS_LDST, +	AARCH64_INSN_CLS_DP_REG, +	AARCH64_INSN_CLS_LDST, +	AARCH64_INSN_CLS_DP_FPSIMD, +	AARCH64_INSN_CLS_DP_IMM, +	AARCH64_INSN_CLS_DP_IMM, +	AARCH64_INSN_CLS_BR_SYS, +	AARCH64_INSN_CLS_BR_SYS, +	AARCH64_INSN_CLS_LDST, +	AARCH64_INSN_CLS_DP_REG, +	AARCH64_INSN_CLS_LDST, +	AARCH64_INSN_CLS_DP_FPSIMD, +}; + +enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn) +{ +	return aarch64_insn_encoding_class[(insn >> 25) & 0xf]; +} + +/* NOP is an alias of HINT */ +bool __kprobes aarch64_insn_is_nop(u32 insn) +{ +	if (!aarch64_insn_is_hint(insn)) +		return false; + +	switch (insn & 0xFE0) { +	case AARCH64_INSN_HINT_YIELD: +	case AARCH64_INSN_HINT_WFE: +	case AARCH64_INSN_HINT_WFI: +	case AARCH64_INSN_HINT_SEV: +	case AARCH64_INSN_HINT_SEVL: +		return false; +	default: +		return true; +	} +} + +/* + * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always + * little-endian. + */ +int __kprobes aarch64_insn_read(void *addr, u32 *insnp) +{ +	int ret; +	u32 val; + +	ret = probe_kernel_read(&val, addr, AARCH64_INSN_SIZE); +	if (!ret) +		*insnp = le32_to_cpu(val); + +	return ret; +} + +int __kprobes aarch64_insn_write(void *addr, u32 insn) +{ +	insn = cpu_to_le32(insn); +	return probe_kernel_write(addr, &insn, AARCH64_INSN_SIZE); +} + +static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn) +{ +	if (aarch64_get_insn_class(insn) != AARCH64_INSN_CLS_BR_SYS) +		return false; + +	return	aarch64_insn_is_b(insn) || +		aarch64_insn_is_bl(insn) || +		aarch64_insn_is_svc(insn) || +		aarch64_insn_is_hvc(insn) || +		aarch64_insn_is_smc(insn) || +		aarch64_insn_is_brk(insn) || +		aarch64_insn_is_nop(insn); +} + +/* + * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a + * Section B2.6.5 "Concurrent modification and execution of instructions": + * Concurrent modification and execution of instructions can lead to the + * resulting instruction performing any behavior that can be achieved by + * executing any sequence of instructions that can be executed from the + * same Exception level, except where the instruction before modification + * and the instruction after modification is a B, BL, NOP, BKPT, SVC, HVC, + * or SMC instruction. 
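+ *
+ * In practice this means a NOP can be replaced by a B/BL (and vice
+ * versa) while other CPUs keep executing; e.g. (illustrative, with
+ * 'branch_insn' standing for any B/BL encoding)
+ *
+ *	aarch64_insn_hotpatch_safe(aarch64_insn_gen_nop(), branch_insn)
+ *
+ * is true, so aarch64_insn_patch_text() below can take the cheaper
+ * aarch64_insn_patch_text_nosync() + kick_all_cpus_sync() path rather
+ * than the stop_machine()-based aarch64_insn_patch_text_sync().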
+ */ +bool __kprobes aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn) +{ +	return __aarch64_insn_hotpatch_safe(old_insn) && +	       __aarch64_insn_hotpatch_safe(new_insn); +} + +int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) +{ +	u32 *tp = addr; +	int ret; + +	/* A64 instructions must be word aligned */ +	if ((uintptr_t)tp & 0x3) +		return -EINVAL; + +	ret = aarch64_insn_write(tp, insn); +	if (ret == 0) +		flush_icache_range((uintptr_t)tp, +				   (uintptr_t)tp + AARCH64_INSN_SIZE); + +	return ret; +} + +struct aarch64_insn_patch { +	void		**text_addrs; +	u32		*new_insns; +	int		insn_cnt; +	atomic_t	cpu_count; +}; + +static int __kprobes aarch64_insn_patch_text_cb(void *arg) +{ +	int i, ret = 0; +	struct aarch64_insn_patch *pp = arg; + +	/* The first CPU becomes master */ +	if (atomic_inc_return(&pp->cpu_count) == 1) { +		for (i = 0; ret == 0 && i < pp->insn_cnt; i++) +			ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i], +							     pp->new_insns[i]); +		/* +		 * aarch64_insn_patch_text_nosync() calls flush_icache_range(), +		 * which ends with "dsb; isb" pair guaranteeing global +		 * visibility. +		 */ +		atomic_set(&pp->cpu_count, -1); +	} else { +		while (atomic_read(&pp->cpu_count) != -1) +			cpu_relax(); +		isb(); +	} + +	return ret; +} + +int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) +{ +	struct aarch64_insn_patch patch = { +		.text_addrs = addrs, +		.new_insns = insns, +		.insn_cnt = cnt, +		.cpu_count = ATOMIC_INIT(0), +	}; + +	if (cnt <= 0) +		return -EINVAL; + +	return stop_machine(aarch64_insn_patch_text_cb, &patch, +			    cpu_online_mask); +} + +int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) +{ +	int ret; +	u32 insn; + +	/* Unsafe to patch multiple instructions without synchronizaiton */ +	if (cnt == 1) { +		ret = aarch64_insn_read(addrs[0], &insn); +		if (ret) +			return ret; + +		if (aarch64_insn_hotpatch_safe(insn, insns[0])) { +			/* +			 * ARMv8 architecture doesn't guarantee all CPUs see +			 * the new instruction after returning from function +			 * aarch64_insn_patch_text_nosync(). So send IPIs to +			 * all other CPUs to achieve instruction +			 * synchronization. +			 */ +			ret = aarch64_insn_patch_text_nosync(addrs[0], insns[0]); +			kick_all_cpus_sync(); +			return ret; +		} +	} + +	return aarch64_insn_patch_text_sync(addrs, insns, cnt); +} + +u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type, +				  u32 insn, u64 imm) +{ +	u32 immlo, immhi, lomask, himask, mask; +	int shift; + +	switch (type) { +	case AARCH64_INSN_IMM_ADR: +		lomask = 0x3; +		himask = 0x7ffff; +		immlo = imm & lomask; +		imm >>= 2; +		immhi = imm & himask; +		imm = (immlo << 24) | (immhi); +		mask = (lomask << 24) | (himask); +		shift = 5; +		break; +	case AARCH64_INSN_IMM_26: +		mask = BIT(26) - 1; +		shift = 0; +		break; +	case AARCH64_INSN_IMM_19: +		mask = BIT(19) - 1; +		shift = 5; +		break; +	case AARCH64_INSN_IMM_16: +		mask = BIT(16) - 1; +		shift = 5; +		break; +	case AARCH64_INSN_IMM_14: +		mask = BIT(14) - 1; +		shift = 5; +		break; +	case AARCH64_INSN_IMM_12: +		mask = BIT(12) - 1; +		shift = 10; +		break; +	case AARCH64_INSN_IMM_9: +		mask = BIT(9) - 1; +		shift = 12; +		break; +	default: +		pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n", +			type); +		return 0; +	} + +	/* Update the immediate field. 
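+	 * E.g. for AARCH64_INSN_IMM_26 the low 26 bits of 'imm' are placed
+	 * at bit 0 of a B/BL opcode, while AARCH64_INSN_IMM_16 places a
+	 * 16-bit field at bit 5 of a MOVZ/MOVN/MOVK, as selected by the
+	 * mask/shift pairs above.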
*/ +	insn &= ~(mask << shift); +	insn |= (imm & mask) << shift; + +	return insn; +} + +u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr, +					  enum aarch64_insn_branch_type type) +{ +	u32 insn; +	long offset; + +	/* +	 * PC: A 64-bit Program Counter holding the address of the current +	 * instruction. A64 instructions must be word-aligned. +	 */ +	BUG_ON((pc & 0x3) || (addr & 0x3)); + +	/* +	 * B/BL support [-128M, 128M) offset +	 * ARM64 virtual address arrangement guarantees all kernel and module +	 * texts are within +/-128M. +	 */ +	offset = ((long)addr - (long)pc); +	BUG_ON(offset < -SZ_128M || offset >= SZ_128M); + +	if (type == AARCH64_INSN_BRANCH_LINK) +		insn = aarch64_insn_get_bl_value(); +	else +		insn = aarch64_insn_get_b_value(); + +	return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn, +					     offset >> 2); +} + +u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op) +{ +	return aarch64_insn_get_hint_value() | op; +} + +u32 __kprobes aarch64_insn_gen_nop(void) +{ +	return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); +} diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index ecb3354292e..0f08dfd69eb 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -81,3 +81,68 @@ void __init init_IRQ(void)  	if (!handle_arch_irq)  		panic("No interrupt controller found.");  } + +#ifdef CONFIG_HOTPLUG_CPU +static bool migrate_one_irq(struct irq_desc *desc) +{ +	struct irq_data *d = irq_desc_get_irq_data(desc); +	const struct cpumask *affinity = d->affinity; +	struct irq_chip *c; +	bool ret = false; + +	/* +	 * If this is a per-CPU interrupt, or the affinity does not +	 * include this CPU, then we have nothing to do. +	 */ +	if (irqd_is_per_cpu(d) || !cpumask_test_cpu(smp_processor_id(), affinity)) +		return false; + +	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) +		ret = true; + +	/* +	 * when using forced irq_set_affinity we must ensure that the cpu +	 * being offlined is not present in the affinity mask, it may be +	 * selected as the target CPU otherwise +	 */ +	affinity = cpu_online_mask; +	c = irq_data_get_irq_chip(d); +	if (!c->irq_set_affinity) +		pr_debug("IRQ%u: unable to set affinity\n", d->irq); +	else if (c->irq_set_affinity(d, affinity, true) == IRQ_SET_MASK_OK && ret) +		cpumask_copy(d->affinity, affinity); + +	return ret; +} + +/* + * The current CPU has been marked offline.  Migrate IRQs off this CPU. + * If the affinity settings do not allow other CPUs, force them onto any + * available CPU. + * + * Note: we must iterate over all IRQs, whether they have an attached + * action structure or not, as we need to get chained interrupts too. + */ +void migrate_irqs(void) +{ +	unsigned int i; +	struct irq_desc *desc; +	unsigned long flags; + +	local_irq_save(flags); + +	for_each_irq_desc(i, desc) { +		bool affinity_broken; + +		raw_spin_lock(&desc->lock); +		affinity_broken = migrate_one_irq(desc); +		raw_spin_unlock(&desc->lock); + +		if (affinity_broken) +			pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", +					    i, smp_processor_id()); +	} + +	local_irq_restore(flags); +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c new file mode 100644 index 00000000000..263a166291f --- /dev/null +++ b/arch/arm64/kernel/jump_label.c @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2013 Huawei Ltd. 
+ * Author: Jiang Liu <liuj97@gmail.com> + * + * Based on arch/arm/kernel/jump_label.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/kernel.h> +#include <linux/jump_label.h> +#include <asm/insn.h> + +#ifdef HAVE_JUMP_LABEL + +static void __arch_jump_label_transform(struct jump_entry *entry, +					enum jump_label_type type, +					bool is_static) +{ +	void *addr = (void *)entry->code; +	u32 insn; + +	if (type == JUMP_LABEL_ENABLE) { +		insn = aarch64_insn_gen_branch_imm(entry->code, +						   entry->target, +						   AARCH64_INSN_BRANCH_NOLINK); +	} else { +		insn = aarch64_insn_gen_nop(); +	} + +	if (is_static) +		aarch64_insn_patch_text_nosync(addr, insn); +	else +		aarch64_insn_patch_text(&addr, &insn, 1); +} + +void arch_jump_label_transform(struct jump_entry *entry, +			       enum jump_label_type type) +{ +	__arch_jump_label_transform(entry, type, false); +} + +void arch_jump_label_transform_static(struct jump_entry *entry, +				      enum jump_label_type type) +{ +	__arch_jump_label_transform(entry, type, true); +} + +#endif	/* HAVE_JUMP_LABEL */ diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c new file mode 100644 index 00000000000..75c9cf1aafe --- /dev/null +++ b/arch/arm64/kernel/kgdb.c @@ -0,0 +1,336 @@ +/* + * AArch64 KGDB support + * + * Based on arch/arm/kernel/kgdb.c + * + * Copyright (C) 2013 Cavium Inc. + * Author: Vijaya Kumar K <vijaya.kumar@caviumnetworks.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/irq.h> +#include <linux/kdebug.h> +#include <linux/kgdb.h> +#include <asm/traps.h> + +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { +	{ "x0", 8, offsetof(struct pt_regs, regs[0])}, +	{ "x1", 8, offsetof(struct pt_regs, regs[1])}, +	{ "x2", 8, offsetof(struct pt_regs, regs[2])}, +	{ "x3", 8, offsetof(struct pt_regs, regs[3])}, +	{ "x4", 8, offsetof(struct pt_regs, regs[4])}, +	{ "x5", 8, offsetof(struct pt_regs, regs[5])}, +	{ "x6", 8, offsetof(struct pt_regs, regs[6])}, +	{ "x7", 8, offsetof(struct pt_regs, regs[7])}, +	{ "x8", 8, offsetof(struct pt_regs, regs[8])}, +	{ "x9", 8, offsetof(struct pt_regs, regs[9])}, +	{ "x10", 8, offsetof(struct pt_regs, regs[10])}, +	{ "x11", 8, offsetof(struct pt_regs, regs[11])}, +	{ "x12", 8, offsetof(struct pt_regs, regs[12])}, +	{ "x13", 8, offsetof(struct pt_regs, regs[13])}, +	{ "x14", 8, offsetof(struct pt_regs, regs[14])}, +	{ "x15", 8, offsetof(struct pt_regs, regs[15])}, +	{ "x16", 8, offsetof(struct pt_regs, regs[16])}, +	{ "x17", 8, offsetof(struct pt_regs, regs[17])}, +	{ "x18", 8, offsetof(struct pt_regs, regs[18])}, +	{ "x19", 8, offsetof(struct pt_regs, regs[19])}, +	{ "x20", 8, offsetof(struct pt_regs, regs[20])}, +	{ "x21", 8, offsetof(struct pt_regs, regs[21])}, +	{ "x22", 8, offsetof(struct pt_regs, regs[22])}, +	{ "x23", 8, offsetof(struct pt_regs, regs[23])}, +	{ "x24", 8, offsetof(struct pt_regs, regs[24])}, +	{ "x25", 8, offsetof(struct pt_regs, regs[25])}, +	{ "x26", 8, offsetof(struct pt_regs, regs[26])}, +	{ "x27", 8, offsetof(struct pt_regs, regs[27])}, +	{ "x28", 8, offsetof(struct pt_regs, regs[28])}, +	{ "x29", 8, offsetof(struct pt_regs, regs[29])}, +	{ "x30", 8, offsetof(struct pt_regs, regs[30])}, +	{ "sp", 8, offsetof(struct pt_regs, sp)}, +	{ "pc", 8, offsetof(struct pt_regs, pc)}, +	{ "pstate", 8, offsetof(struct pt_regs, pstate)}, +	{ "v0", 16, -1 }, +	{ "v1", 16, -1 }, +	{ "v2", 16, -1 }, +	{ "v3", 16, -1 }, +	{ "v4", 16, -1 }, +	{ "v5", 16, -1 }, +	{ "v6", 16, -1 }, +	{ "v7", 16, -1 }, +	{ "v8", 16, -1 }, +	{ "v9", 16, -1 }, +	{ "v10", 16, -1 }, +	{ "v11", 16, -1 }, +	{ "v12", 16, -1 }, +	{ "v13", 16, -1 }, +	{ "v14", 16, -1 }, +	{ "v15", 16, -1 }, +	{ "v16", 16, -1 }, +	{ "v17", 16, -1 }, +	{ "v18", 16, -1 }, +	{ "v19", 16, -1 }, +	{ "v20", 16, -1 }, +	{ "v21", 16, -1 }, +	{ "v22", 16, -1 }, +	{ "v23", 16, -1 }, +	{ "v24", 16, -1 }, +	{ "v25", 16, -1 }, +	{ "v26", 16, -1 }, +	{ "v27", 16, -1 }, +	{ "v28", 16, -1 }, +	{ "v29", 16, -1 }, +	{ "v30", 16, -1 }, +	{ "v31", 16, -1 }, +	{ "fpsr", 4, -1 }, +	{ "fpcr", 4, -1 }, +}; + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ +	if (regno >= DBG_MAX_REG_NUM || regno < 0) +		return NULL; + +	if (dbg_reg_def[regno].offset != -1) +		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, +		       dbg_reg_def[regno].size); +	else +		memset(mem, 0, dbg_reg_def[regno].size); +	return dbg_reg_def[regno].name; +} + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ +	if (regno >= DBG_MAX_REG_NUM || regno < 0) +		return -EINVAL; + +	if (dbg_reg_def[regno].offset != -1) +		memcpy((void *)regs + dbg_reg_def[regno].offset, mem, +		       dbg_reg_def[regno].size); +	return 0; +} + +void +sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task) +{ +	struct pt_regs *thread_regs; + +	/* Initialize to zero */ +	memset((char *)gdb_regs, 0, NUMREGBYTES); +	thread_regs = task_pt_regs(task); +	memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES); +} + +void kgdb_arch_set_pc(struct pt_regs *regs, 
unsigned long pc) +{ +	regs->pc = pc; +} + +static int compiled_break; + +static void kgdb_arch_update_addr(struct pt_regs *regs, +				char *remcom_in_buffer) +{ +	unsigned long addr; +	char *ptr; + +	ptr = &remcom_in_buffer[1]; +	if (kgdb_hex2long(&ptr, &addr)) +		kgdb_arch_set_pc(regs, addr); +	else if (compiled_break == 1) +		kgdb_arch_set_pc(regs, regs->pc + 4); + +	compiled_break = 0; +} + +int kgdb_arch_handle_exception(int exception_vector, int signo, +			       int err_code, char *remcom_in_buffer, +			       char *remcom_out_buffer, +			       struct pt_regs *linux_regs) +{ +	int err; + +	switch (remcom_in_buffer[0]) { +	case 'D': +	case 'k': +		/* +		 * Packet D (Detach), k (kill). No special handling +		 * is required here. Handle same as c packet. +		 */ +	case 'c': +		/* +		 * Packet c (Continue) to continue executing. +		 * Set pc to required address. +		 * Try to read optional parameter and set pc. +		 * If this was a compiled breakpoint, we need to move +		 * to the next instruction else we will just breakpoint +		 * over and over again. +		 */ +		kgdb_arch_update_addr(linux_regs, remcom_in_buffer); +		atomic_set(&kgdb_cpu_doing_single_step, -1); +		kgdb_single_step =  0; + +		/* +		 * Received continue command, disable single step +		 */ +		if (kernel_active_single_step()) +			kernel_disable_single_step(); + +		err = 0; +		break; +	case 's': +		/* +		 * Update step address value with address passed +		 * with step packet. +		 * On debug exception return PC is copied to ELR +		 * So just update PC. +		 * If no step address is passed, resume from the address +		 * pointed by PC. Do not update PC +		 */ +		kgdb_arch_update_addr(linux_regs, remcom_in_buffer); +		atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); +		kgdb_single_step =  1; + +		/* +		 * Enable single step handling +		 */ +		if (!kernel_active_single_step()) +			kernel_enable_single_step(linux_regs); +		err = 0; +		break; +	default: +		err = -1; +	} +	return err; +} + +static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) +{ +	kgdb_handle_exception(1, SIGTRAP, 0, regs); +	return 0; +} + +static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) +{ +	compiled_break = 1; +	kgdb_handle_exception(1, SIGTRAP, 0, regs); + +	return 0; +} + +static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) +{ +	kgdb_handle_exception(1, SIGTRAP, 0, regs); +	return 0; +} + +static struct break_hook kgdb_brkpt_hook = { +	.esr_mask	= 0xffffffff, +	.esr_val	= DBG_ESR_VAL_BRK(KGDB_DYN_DGB_BRK_IMM), +	.fn		= kgdb_brk_fn +}; + +static struct break_hook kgdb_compiled_brkpt_hook = { +	.esr_mask	= 0xffffffff, +	.esr_val	= DBG_ESR_VAL_BRK(KDBG_COMPILED_DBG_BRK_IMM), +	.fn		= kgdb_compiled_brk_fn +}; + +static struct step_hook kgdb_step_hook = { +	.fn		= kgdb_step_brk_fn +}; + +static void kgdb_call_nmi_hook(void *ignored) +{ +	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); +} + +void kgdb_roundup_cpus(unsigned long flags) +{ +	local_irq_enable(); +	smp_call_function(kgdb_call_nmi_hook, NULL, 0); +	local_irq_disable(); +} + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ +	struct pt_regs *regs = args->regs; + +	if (kgdb_handle_exception(1, args->signr, cmd, regs)) +		return NOTIFY_DONE; +	return NOTIFY_STOP; +} + +static int +kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ +	unsigned long flags; +	int ret; + +	local_irq_save(flags); +	ret = __kgdb_notify(ptr, cmd); +	local_irq_restore(flags); + +	return ret; +} + +static struct 
notifier_block kgdb_notifier = { +	.notifier_call	= kgdb_notify, +	/* +	 * Want to be lowest priority +	 */ +	.priority	= -INT_MAX, +}; + +/* + * kgdb_arch_init - Perform any architecture specific initalization. + * This function will handle the initalization of any architecture + * specific callbacks. + */ +int kgdb_arch_init(void) +{ +	int ret = register_die_notifier(&kgdb_notifier); + +	if (ret != 0) +		return ret; + +	register_break_hook(&kgdb_brkpt_hook); +	register_break_hook(&kgdb_compiled_brkpt_hook); +	register_step_hook(&kgdb_step_hook); +	return 0; +} + +/* + * kgdb_arch_exit - Perform any architecture specific uninitalization. + * This function will handle the uninitalization of any architecture + * specific callbacks, for dynamic registration and unregistration. + */ +void kgdb_arch_exit(void) +{ +	unregister_break_hook(&kgdb_brkpt_hook); +	unregister_break_hook(&kgdb_compiled_brkpt_hook); +	unregister_step_hook(&kgdb_step_hook); +	unregister_die_notifier(&kgdb_notifier); +} + +/* + * ARM instructions are always in LE. + * Break instruction is encoded in LE format + */ +struct kgdb_arch arch_kgdb_ops = { +	.gdb_bpt_instr = { +		KGDB_DYN_BRK_INS_BYTE0, +		KGDB_DYN_BRK_INS_BYTE1, +		KGDB_DYN_BRK_INS_BYTE2, +		KGDB_DYN_BRK_INS_BYTE3, +	} +}; diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 8b69ecb1d8b..7787208e8cc 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -27,6 +27,9 @@   *   * See Documentation/arm/kernel_user_helpers.txt for formal definitions.   */ + +#include <asm/unistd32.h> +  	.align	5  	.globl	__kuser_helper_start  __kuser_helper_start: @@ -35,33 +38,32 @@ __kuser_cmpxchg64:			// 0xffff0f60  	.inst	0xe92d00f0		//	push		{r4, r5, r6, r7}  	.inst	0xe1c040d0		//	ldrd		r4, r5, [r0]  	.inst	0xe1c160d0		//	ldrd		r6, r7, [r1] -	.inst	0xf57ff05f		//	dmb		sy  	.inst	0xe1b20f9f		// 1:	ldrexd		r0, r1, [r2]  	.inst	0xe0303004		//	eors		r3, r0, r4  	.inst	0x00313005		//	eoreqs		r3, r1, r5 -	.inst	0x01a23f96		//	strexdeq	r3, r6, [r2] +	.inst	0x01a23e96		//	stlexdeq	r3, r6, [r2]  	.inst	0x03330001		//	teqeq		r3, #1  	.inst	0x0afffff9		//	beq		1b -	.inst	0xf57ff05f		//	dmb		sy +	.inst	0xf57ff05b		//	dmb		ish  	.inst	0xe2730000		//	rsbs		r0, r3, #0  	.inst	0xe8bd00f0		//	pop		{r4, r5, r6, r7}  	.inst	0xe12fff1e		//	bx		lr  	.align	5  __kuser_memory_barrier:			// 0xffff0fa0 -	.inst	0xf57ff05f		//	dmb		sy +	.inst	0xf57ff05b		//	dmb		ish  	.inst	0xe12fff1e		//	bx		lr  	.align	5  __kuser_cmpxchg:			// 0xffff0fc0 -	.inst	0xf57ff05f		//	dmb		sy  	.inst	0xe1923f9f		// 1:	ldrex		r3, [r2]  	.inst	0xe0533000		//	subs		r3, r3, r0 -	.inst	0x01823f91		//	strexeq	r3, r1, [r2] +	.inst	0x01823e91		//	stlexeq		r3, r1, [r2]  	.inst	0x03330001		//	teqeq		r3, #1  	.inst	0x0afffffa		//	beq		1b +	.inst	0xf57ff05b		//	dmb		ish  	.inst	0xe2730000		//	rsbs		r0, r3, #0 -	.inst	0xeaffffef		//	b		<__kuser_memory_barrier> +	.inst	0xe12fff1e		//	bx		lr  	.align	5  __kuser_get_tls:			// 0xffff0fe0 @@ -75,3 +77,42 @@ __kuser_helper_version:			// 0xffff0ffc  	.word	((__kuser_helper_end - __kuser_helper_start) >> 5)  	.globl	__kuser_helper_end  __kuser_helper_end: + +/* + * AArch32 sigreturn code + * + * For ARM syscalls, the syscall number has to be loaded into r7. + * We do not support an OABI userspace. + * + * For Thumb syscalls, we also pass the syscall number via r7. We therefore + * need two 16-bit instructions. 
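+ *
+ * The four snippets below (ARM and Thumb variants of sigreturn and
+ * rt_sigreturn) each load the relevant syscall number into r7 and then
+ * issue an svc. When a compat signal frame is set up without a
+ * user-supplied restorer, the handler's return address is pointed at
+ * the variant matching the task's current instruction set, so returning
+ * from the handler re-enters the kernel through the normal syscall path.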
+ */ +	.globl __aarch32_sigret_code_start +__aarch32_sigret_code_start: + +	/* +	 * ARM Code +	 */ +	.byte	__NR_compat_sigreturn, 0x70, 0xa0, 0xe3	// mov	r7, #__NR_compat_sigreturn +	.byte	__NR_compat_sigreturn, 0x00, 0x00, 0xef	// svc	#__NR_compat_sigreturn + +	/* +	 * Thumb code +	 */ +	.byte	__NR_compat_sigreturn, 0x27			// svc	#__NR_compat_sigreturn +	.byte	__NR_compat_sigreturn, 0xdf			// mov	r7, #__NR_compat_sigreturn + +	/* +	 * ARM code +	 */ +	.byte	__NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3	// mov	r7, #__NR_compat_rt_sigreturn +	.byte	__NR_compat_rt_sigreturn, 0x00, 0x00, 0xef	// svc	#__NR_compat_rt_sigreturn + +	/* +	 * Thumb code +	 */ +	.byte	__NR_compat_rt_sigreturn, 0x27			// svc	#__NR_compat_rt_sigreturn +	.byte	__NR_compat_rt_sigreturn, 0xdf			// mov	r7, #__NR_compat_rt_sigreturn + +        .globl __aarch32_sigret_code_end +__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index ca0e3d55da9..1eb1cc95513 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -25,11 +25,15 @@  #include <linux/mm.h>  #include <linux/moduleloader.h>  #include <linux/vmalloc.h> +#include <asm/insn.h> + +#define	AARCH64_INSN_IMM_MOVNZ		AARCH64_INSN_IMM_MAX +#define	AARCH64_INSN_IMM_MOVK		AARCH64_INSN_IMM_16  void *module_alloc(unsigned long size)  {  	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, -				    GFP_KERNEL, PAGE_KERNEL_EXEC, -1, +				    GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,  				    __builtin_return_address(0));  } @@ -94,25 +98,18 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)  	return 0;  } -enum aarch64_imm_type { -	INSN_IMM_MOVNZ, -	INSN_IMM_MOVK, -	INSN_IMM_ADR, -	INSN_IMM_26, -	INSN_IMM_19, -	INSN_IMM_16, -	INSN_IMM_14, -	INSN_IMM_12, -	INSN_IMM_9, -}; - -static u32 encode_insn_immediate(enum aarch64_imm_type type, u32 insn, u64 imm) +static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, +			   int lsb, enum aarch64_insn_imm_type imm_type)  { -	u32 immlo, immhi, lomask, himask, mask; -	int shift; +	u64 imm, limit = 0; +	s64 sval; +	u32 insn = le32_to_cpu(*(u32 *)place); + +	sval = do_reloc(op, place, val); +	sval >>= lsb; +	imm = sval & 0xffff; -	switch (type) { -	case INSN_IMM_MOVNZ: +	if (imm_type == AARCH64_INSN_IMM_MOVNZ) {  		/*  		 * For signed MOVW relocations, we have to manipulate the  		 * instruction encoding depending on whether or not the @@ -131,70 +128,12 @@ static u32 encode_insn_immediate(enum aarch64_imm_type type, u32 insn, u64 imm)  			 */  			imm = ~imm;  		} -	case INSN_IMM_MOVK: -		mask = BIT(16) - 1; -		shift = 5; -		break; -	case INSN_IMM_ADR: -		lomask = 0x3; -		himask = 0x7ffff; -		immlo = imm & lomask; -		imm >>= 2; -		immhi = imm & himask; -		imm = (immlo << 24) | (immhi); -		mask = (lomask << 24) | (himask); -		shift = 5; -		break; -	case INSN_IMM_26: -		mask = BIT(26) - 1; -		shift = 0; -		break; -	case INSN_IMM_19: -		mask = BIT(19) - 1; -		shift = 5; -		break; -	case INSN_IMM_16: -		mask = BIT(16) - 1; -		shift = 5; -		break; -	case INSN_IMM_14: -		mask = BIT(14) - 1; -		shift = 5; -		break; -	case INSN_IMM_12: -		mask = BIT(12) - 1; -		shift = 10; -		break; -	case INSN_IMM_9: -		mask = BIT(9) - 1; -		shift = 12; -		break; -	default: -		pr_err("encode_insn_immediate: unknown immediate encoding %d\n", -			type); -		return 0; +		imm_type = AARCH64_INSN_IMM_MOVK;  	} -	/* Update the immediate field. 
*/ -	insn &= ~(mask << shift); -	insn |= (imm & mask) << shift; - -	return insn; -} - -static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, -			   int lsb, enum aarch64_imm_type imm_type) -{ -	u64 imm, limit = 0; -	s64 sval; -	u32 insn = *(u32 *)place; - -	sval = do_reloc(op, place, val); -	sval >>= lsb; -	imm = sval & 0xffff; -  	/* Update the instruction with the new encoding. */ -	*(u32 *)place = encode_insn_immediate(imm_type, insn, imm); +	insn = aarch64_insn_encode_immediate(imm_type, insn, imm); +	*(u32 *)place = cpu_to_le32(insn);  	/* Shift out the immediate field. */  	sval >>= 16; @@ -203,9 +142,9 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,  	 * For unsigned immediates, the overflow check is straightforward.  	 * For signed immediates, the sign bit is actually the bit past the  	 * most significant bit of the field. -	 * The INSN_IMM_16 immediate type is unsigned. +	 * The AARCH64_INSN_IMM_16 immediate type is unsigned.  	 */ -	if (imm_type != INSN_IMM_16) { +	if (imm_type != AARCH64_INSN_IMM_16) {  		sval++;  		limit++;  	} @@ -218,11 +157,11 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,  }  static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val, -			  int lsb, int len, enum aarch64_imm_type imm_type) +			  int lsb, int len, enum aarch64_insn_imm_type imm_type)  {  	u64 imm, imm_mask;  	s64 sval; -	u32 insn = *(u32 *)place; +	u32 insn = le32_to_cpu(*(u32 *)place);  	/* Calculate the relocation value. */  	sval = do_reloc(op, place, val); @@ -233,7 +172,8 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val,  	imm = sval & imm_mask;  	/* Update the instruction's immediate field. */ -	*(u32 *)place = encode_insn_immediate(imm_type, insn, imm); +	insn = aarch64_insn_encode_immediate(imm_type, insn, imm); +	*(u32 *)place = cpu_to_le32(insn);  	/*  	 * Extract the upper value bits (including the sign bit) and @@ -315,125 +255,125 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  			overflow_check = false;  		case R_AARCH64_MOVW_UABS_G0:  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, -					      INSN_IMM_16); +					      AARCH64_INSN_IMM_16);  			break;  		case R_AARCH64_MOVW_UABS_G1_NC:  			overflow_check = false;  		case R_AARCH64_MOVW_UABS_G1:  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, -					      INSN_IMM_16); +					      AARCH64_INSN_IMM_16);  			break;  		case R_AARCH64_MOVW_UABS_G2_NC:  			overflow_check = false;  		case R_AARCH64_MOVW_UABS_G2:  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, -					      INSN_IMM_16); +					      AARCH64_INSN_IMM_16);  			break;  		case R_AARCH64_MOVW_UABS_G3:  			/* We're using the top bits so we can't overflow. 
*/  			overflow_check = false;  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, -					      INSN_IMM_16); +					      AARCH64_INSN_IMM_16);  			break;  		case R_AARCH64_MOVW_SABS_G0:  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		case R_AARCH64_MOVW_SABS_G1:  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		case R_AARCH64_MOVW_SABS_G2:  			ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		case R_AARCH64_MOVW_PREL_G0_NC:  			overflow_check = false;  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, -					      INSN_IMM_MOVK); +					      AARCH64_INSN_IMM_MOVK);  			break;  		case R_AARCH64_MOVW_PREL_G0:  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		case R_AARCH64_MOVW_PREL_G1_NC:  			overflow_check = false;  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, -					      INSN_IMM_MOVK); +					      AARCH64_INSN_IMM_MOVK);  			break;  		case R_AARCH64_MOVW_PREL_G1:  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		case R_AARCH64_MOVW_PREL_G2_NC:  			overflow_check = false;  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, -					      INSN_IMM_MOVK); +					      AARCH64_INSN_IMM_MOVK);  			break;  		case R_AARCH64_MOVW_PREL_G2:  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		case R_AARCH64_MOVW_PREL_G3:  			/* We're using the top bits so we can't overflow. */  			overflow_check = false;  			ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 48, -					      INSN_IMM_MOVNZ); +					      AARCH64_INSN_IMM_MOVNZ);  			break;  		/* Immediate instruction relocations. 
*/  		case R_AARCH64_LD_PREL_LO19:  			ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, -					     INSN_IMM_19); +					     AARCH64_INSN_IMM_19);  			break;  		case R_AARCH64_ADR_PREL_LO21:  			ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, -					     INSN_IMM_ADR); +					     AARCH64_INSN_IMM_ADR);  			break;  		case R_AARCH64_ADR_PREL_PG_HI21_NC:  			overflow_check = false;  		case R_AARCH64_ADR_PREL_PG_HI21:  			ovf = reloc_insn_imm(RELOC_OP_PAGE, loc, val, 12, 21, -					     INSN_IMM_ADR); +					     AARCH64_INSN_IMM_ADR);  			break;  		case R_AARCH64_ADD_ABS_LO12_NC:  		case R_AARCH64_LDST8_ABS_LO12_NC:  			overflow_check = false;  			ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 0, 12, -					     INSN_IMM_12); +					     AARCH64_INSN_IMM_12);  			break;  		case R_AARCH64_LDST16_ABS_LO12_NC:  			overflow_check = false;  			ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 1, 11, -					     INSN_IMM_12); +					     AARCH64_INSN_IMM_12);  			break;  		case R_AARCH64_LDST32_ABS_LO12_NC:  			overflow_check = false;  			ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 2, 10, -					     INSN_IMM_12); +					     AARCH64_INSN_IMM_12);  			break;  		case R_AARCH64_LDST64_ABS_LO12_NC:  			overflow_check = false;  			ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 3, 9, -					     INSN_IMM_12); +					     AARCH64_INSN_IMM_12);  			break;  		case R_AARCH64_LDST128_ABS_LO12_NC:  			overflow_check = false;  			ovf = reloc_insn_imm(RELOC_OP_ABS, loc, val, 4, 8, -					     INSN_IMM_12); +					     AARCH64_INSN_IMM_12);  			break;  		case R_AARCH64_TSTBR14:  			ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 14, -					     INSN_IMM_14); +					     AARCH64_INSN_IMM_14);  			break;  		case R_AARCH64_CONDBR19:  			ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 19, -					     INSN_IMM_19); +					     AARCH64_INSN_IMM_19);  			break;  		case R_AARCH64_JUMP26:  		case R_AARCH64_CALL26:  			ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, -					     INSN_IMM_26); +					     AARCH64_INSN_IMM_26);  			break;  		default: diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index cea1594ff93..baf5afb7e6a 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -22,6 +22,7 @@  #include <linux/bitmap.h>  #include <linux/interrupt.h> +#include <linux/irq.h>  #include <linux/kernel.h>  #include <linux/export.h>  #include <linux/perf_event.h> @@ -363,26 +364,53 @@ validate_group(struct perf_event *event)  }  static void +armpmu_disable_percpu_irq(void *data) +{ +	unsigned int irq = *(unsigned int *)data; +	disable_percpu_irq(irq); +} + +static void  armpmu_release_hardware(struct arm_pmu *armpmu)  { -	int i, irq, irqs; +	int irq; +	unsigned int i, irqs;  	struct platform_device *pmu_device = armpmu->plat_device;  	irqs = min(pmu_device->num_resources, num_possible_cpus()); +	if (!irqs) +		return; -	for (i = 0; i < irqs; ++i) { -		if (!cpumask_test_and_clear_cpu(i, &armpmu->active_irqs)) -			continue; -		irq = platform_get_irq(pmu_device, i); -		if (irq >= 0) -			free_irq(irq, armpmu); +	irq = platform_get_irq(pmu_device, 0); +	if (irq <= 0) +		return; + +	if (irq_is_percpu(irq)) { +		on_each_cpu(armpmu_disable_percpu_irq, &irq, 1); +		free_percpu_irq(irq, &cpu_hw_events); +	} else { +		for (i = 0; i < irqs; ++i) { +			if (!cpumask_test_and_clear_cpu(i, &armpmu->active_irqs)) +				continue; +			irq = platform_get_irq(pmu_device, i); +			if (irq > 0) +				free_irq(irq, armpmu); +		}  	}  } +static void +armpmu_enable_percpu_irq(void *data) +{ +	unsigned int irq = 
*(unsigned int *)data; +	enable_percpu_irq(irq, IRQ_TYPE_NONE); +} +  static int  armpmu_reserve_hardware(struct arm_pmu *armpmu)  { -	int i, err, irq, irqs; +	int err, irq; +	unsigned int i, irqs;  	struct platform_device *pmu_device = armpmu->plat_device;  	if (!pmu_device) { @@ -391,39 +419,59 @@ armpmu_reserve_hardware(struct arm_pmu *armpmu)  	}  	irqs = min(pmu_device->num_resources, num_possible_cpus()); -	if (irqs < 1) { +	if (!irqs) {  		pr_err("no irqs for PMUs defined\n");  		return -ENODEV;  	} -	for (i = 0; i < irqs; ++i) { -		err = 0; -		irq = platform_get_irq(pmu_device, i); -		if (irq < 0) -			continue; +	irq = platform_get_irq(pmu_device, 0); +	if (irq <= 0) { +		pr_err("failed to get valid irq for PMU device\n"); +		return -ENODEV; +	} -		/* -		 * If we have a single PMU interrupt that we can't shift, -		 * assume that we're running on a uniprocessor machine and -		 * continue. Otherwise, continue without this interrupt. -		 */ -		if (irq_set_affinity(irq, cpumask_of(i)) && irqs > 1) { -			pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n", -				    irq, i); -			continue; -		} +	if (irq_is_percpu(irq)) { +		err = request_percpu_irq(irq, armpmu->handle_irq, +				"arm-pmu", &cpu_hw_events); -		err = request_irq(irq, armpmu->handle_irq, -				  IRQF_NOBALANCING, -				  "arm-pmu", armpmu);  		if (err) { -			pr_err("unable to request IRQ%d for ARM PMU counters\n", -				irq); +			pr_err("unable to request percpu IRQ%d for ARM PMU counters\n", +					irq);  			armpmu_release_hardware(armpmu);  			return err;  		} -		cpumask_set_cpu(i, &armpmu->active_irqs); +		on_each_cpu(armpmu_enable_percpu_irq, &irq, 1); +	} else { +		for (i = 0; i < irqs; ++i) { +			err = 0; +			irq = platform_get_irq(pmu_device, i); +			if (irq <= 0) +				continue; + +			/* +			 * If we have a single PMU interrupt that we can't shift, +			 * assume that we're running on a uniprocessor machine and +			 * continue. Otherwise, continue without this interrupt. 
+			 */ +			if (irq_set_affinity(irq, cpumask_of(i)) && irqs > 1) { +				pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n", +						irq, i); +				continue; +			} + +			err = request_irq(irq, armpmu->handle_irq, +					IRQF_NOBALANCING, +					"arm-pmu", armpmu); +			if (err) { +				pr_err("unable to request IRQ%d for ARM PMU counters\n", +						irq); +				armpmu_release_hardware(armpmu); +				return err; +			} + +			cpumask_set_cpu(i, &armpmu->active_irqs); +		}  	}  	return 0; @@ -784,8 +832,8 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]  /*   * PMXEVTYPER: Event selection reg   */ -#define	ARMV8_EVTYPE_MASK	0xc80000ff	/* Mask for writable bits */ -#define	ARMV8_EVTYPE_EVENT	0xff		/* Mask for EVENT bits */ +#define	ARMV8_EVTYPE_MASK	0xc80003ff	/* Mask for writable bits */ +#define	ARMV8_EVTYPE_EVENT	0x3ff		/* Mask for EVENT bits */  /*   * Event filters for PMUv3 @@ -1044,7 +1092,7 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev)  	 */  	regs = get_irq_regs(); -	cpuc = &__get_cpu_var(cpu_hw_events); +	cpuc = this_cpu_ptr(&cpu_hw_events);  	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {  		struct perf_event *event = cpuc->events[idx];  		struct hw_perf_event *hwc; @@ -1175,7 +1223,8 @@ static void armv8pmu_reset(void *info)  static int armv8_pmuv3_map_event(struct perf_event *event)  {  	return map_cpu_event(event, &armv8_pmuv3_perf_map, -				&armv8_pmuv3_perf_cache_map, 0xFF); +				&armv8_pmuv3_perf_cache_map, +				ARMV8_EVTYPE_EVENT);  }  static struct arm_pmu armv8pmu = { @@ -1257,7 +1306,7 @@ device_initcall(register_pmu_driver);  static struct pmu_hw_events *armpmu_get_cpu_events(void)  { -	return &__get_cpu_var(cpu_hw_events); +	return this_cpu_ptr(&cpu_hw_events);  }  static void __init cpu_pmu_init(struct arm_pmu *armpmu) @@ -1299,8 +1348,8 @@ early_initcall(init_hw_perf_events);   * Callchain handling code.   */  struct frame_tail { -	struct frame_tail   __user *fp; -	unsigned long	    lr; +	struct frame_tail	__user *fp; +	unsigned long		lr;  } __attribute__((packed));  /* @@ -1337,22 +1386,84 @@ user_backtrace(struct frame_tail __user *tail,  	return buftail.fp;  } +#ifdef CONFIG_COMPAT +/* + * The registers we're interested in are at the end of the variable + * length saved register structure. The fp points at the end of this + * structure so the address of this struct is: + * (struct compat_frame_tail *)(xxx->fp)-1 + * + * This code has been adapted from the ARM OProfile support. + */ +struct compat_frame_tail { +	compat_uptr_t	fp; /* a (struct compat_frame_tail *) in compat mode */ +	u32		sp; +	u32		lr; +} __attribute__((packed)); + +static struct compat_frame_tail __user * +compat_user_backtrace(struct compat_frame_tail __user *tail, +		      struct perf_callchain_entry *entry) +{ +	struct compat_frame_tail buftail; +	unsigned long err; + +	/* Also check accessibility of one struct frame_tail beyond */ +	if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) +		return NULL; + +	pagefault_disable(); +	err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); +	pagefault_enable(); + +	if (err) +		return NULL; + +	perf_callchain_store(entry, buftail.lr); + +	/* +	 * Frame pointers should strictly progress back up the stack +	 * (towards higher addresses). 
+	 */ +	if (tail + 1 >= (struct compat_frame_tail __user *) +			compat_ptr(buftail.fp)) +		return NULL; + +	return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; +} +#endif /* CONFIG_COMPAT */ +  void perf_callchain_user(struct perf_callchain_entry *entry,  			 struct pt_regs *regs)  { -	struct frame_tail __user *tail; -  	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {  		/* We don't support guest os callchain now */  		return;  	}  	perf_callchain_store(entry, regs->pc); -	tail = (struct frame_tail __user *)regs->regs[29]; -	while (entry->nr < PERF_MAX_STACK_DEPTH && -	       tail && !((unsigned long)tail & 0xf)) -		tail = user_backtrace(tail, entry); +	if (!compat_user_mode(regs)) { +		/* AARCH64 mode */ +		struct frame_tail __user *tail; + +		tail = (struct frame_tail __user *)regs->regs[29]; + +		while (entry->nr < PERF_MAX_STACK_DEPTH && +		       tail && !((unsigned long)tail & 0xf)) +			tail = user_backtrace(tail, entry); +	} else { +#ifdef CONFIG_COMPAT +		/* AARCH32 compat mode */ +		struct compat_frame_tail __user *tail; + +		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; + +		while ((entry->nr < PERF_MAX_STACK_DEPTH) && +			tail && !((unsigned long)tail & 0x3)) +			tail = compat_user_backtrace(tail, entry); +#endif +	}  }  /* @@ -1380,6 +1491,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,  	frame.fp = regs->regs[29];  	frame.sp = regs->sp;  	frame.pc = regs->pc; +  	walk_stackframe(&frame, callchain_trace, entry);  } diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c new file mode 100644 index 00000000000..422ebd63b61 --- /dev/null +++ b/arch/arm64/kernel/perf_regs.c @@ -0,0 +1,46 @@ +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/perf_event.h> +#include <linux/bug.h> + +#include <asm/compat.h> +#include <asm/perf_regs.h> +#include <asm/ptrace.h> + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ +	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX)) +		return 0; + +	/* +	 * Compat (i.e. 32 bit) mode: +	 * - PC has been set in the pt_regs struct in kernel_entry, +	 * - Handle SP and LR here. 
+	 */ +	if (compat_user_mode(regs)) { +		if ((u32)idx == PERF_REG_ARM64_SP) +			return regs->compat_sp; +		if ((u32)idx == PERF_REG_ARM64_LR) +			return regs->compat_lr; +	} + +	return regs->regs[idx]; +} + +#define REG_RESERVED (~((1ULL << PERF_REG_ARM64_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ +	if (!mask || mask & REG_RESERVED) +		return -EINVAL; + +	return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ +	if (is_compat_thread(task_thread_info(task))) +		return PERF_SAMPLE_REGS_ABI_32; +	else +		return PERF_SAMPLE_REGS_ABI_64; +} diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 7ae8a1f00c3..43b7c34f92c 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -20,6 +20,7 @@  #include <stdarg.h> +#include <linux/compat.h>  #include <linux/export.h>  #include <linux/sched.h>  #include <linux/kernel.h> @@ -71,8 +72,17 @@ static void setup_restart(void)  void soft_restart(unsigned long addr)  { +	typedef void (*phys_reset_t)(unsigned long); +	phys_reset_t phys_reset; +  	setup_restart(); -	cpu_reset(addr); + +	/* Switch to the identity mapping */ +	phys_reset = (phys_reset_t)virt_to_phys(cpu_reset); +	phys_reset(addr); + +	/* Should never get here */ +	BUG();  }  /* @@ -84,11 +94,6 @@ EXPORT_SYMBOL_GPL(pm_power_off);  void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);  EXPORT_SYMBOL_GPL(arm_pm_restart); -void arch_cpu_idle_prepare(void) -{ -	local_fiq_enable(); -} -  /*   * This is our default idle handler.   */ @@ -102,33 +107,69 @@ void arch_cpu_idle(void)  	local_irq_enable();  } -void machine_shutdown(void) +#ifdef CONFIG_HOTPLUG_CPU +void arch_cpu_idle_dead(void)  { -#ifdef CONFIG_SMP -	smp_send_stop(); +       cpu_die(); +}  #endif + +/* + * Called by kexec, immediately prior to machine_kexec(). + * + * This must completely disable all secondary CPUs; simply causing those CPUs + * to execute e.g. a RAM-based pin loop is not sufficient. This allows the + * kexec'd kernel to use any and all RAM as it sees fit, without having to + * avoid any code or data used by any SW CPU pin loop. The CPU hotplug + * functionality embodied in disable_nonboot_cpus() to achieve this. + */ +void machine_shutdown(void) +{ +	disable_nonboot_cpus();  } +/* + * Halting simply requires that the secondary CPUs stop performing any + * activity (executing tasks, handling interrupts). smp_send_stop() + * achieves this. + */  void machine_halt(void)  { -	machine_shutdown(); +	local_irq_disable(); +	smp_send_stop();  	while (1);  } +/* + * Power-off simply requires that the secondary CPUs stop performing any + * activity (executing tasks, handling interrupts). smp_send_stop() + * achieves this. When the system power is turned off, it will take all CPUs + * with it. + */  void machine_power_off(void)  { -	machine_shutdown(); +	local_irq_disable(); +	smp_send_stop();  	if (pm_power_off)  		pm_power_off();  } +/* + * Restart requires that the secondary CPUs stop performing any activity + * while the primary CPU resets the system. Systems with a single CPU can + * use soft_restart() as their machine descriptor's .restart hook, since that + * will cause the only available CPU to reset. Systems with multiple CPUs must + * provide a HW restart implementation, to ensure that all CPUs reset at once. + * This is required so that any code running after reset on the primary CPU + * doesn't have to co-ordinate with other CPUs to ensure they aren't still + * executing pre-reset code, and using RAM that the primary CPU's code wishes + * to use. 
Implementing such co-ordination would be essentially impossible. + */  void machine_restart(char *cmd)  { -	machine_shutdown(); -  	/* Disable interrupts first */  	local_irq_disable(); -	local_fiq_disable(); +	smp_send_stop();  	/* Now call the architecture specific reboot code. */  	if (arm_pm_restart) @@ -195,7 +236,7 @@ void release_thread(struct task_struct *dead_task)  int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)  { -	fpsimd_save_state(¤t->thread.fpsimd_state); +	fpsimd_preserve_current_state();  	*dst = *src;  	return 0;  } @@ -290,7 +331,7 @@ struct task_struct *__switch_to(struct task_struct *prev,  	 * Complete any pending TLB or cache maintenance on this CPU in case  	 * the thread migrates to a different CPU.  	 */ -	dsb(); +	dsb(ish);  	/* the actual thread switch */  	last = cpu_switch_to(prev, next); @@ -301,6 +342,7 @@ struct task_struct *__switch_to(struct task_struct *prev,  unsigned long get_wchan(struct task_struct *p)  {  	struct stackframe frame; +	unsigned long stack_page;  	int count = 0;  	if (!p || p == current || p->state == TASK_RUNNING)  		return 0; @@ -308,9 +350,11 @@ unsigned long get_wchan(struct task_struct *p)  	frame.fp = thread_saved_fp(p);  	frame.sp = thread_saved_sp(p);  	frame.pc = thread_saved_pc(p); +	stack_page = (unsigned long)task_stack_page(p);  	do { -		int ret = unwind_frame(&frame); -		if (ret < 0) +		if (frame.sp < stack_page || +		    frame.sp >= stack_page + THREAD_SIZE || +		    unwind_frame(&frame))  			return 0;  		if (!in_sched_functions(frame.pc))  			return frame.pc; diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 14f73c445ff..9e9798f9117 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -17,61 +17,80 @@  #include <linux/init.h>  #include <linux/of.h> +#include <linux/smp.h> +#include <linux/reboot.h> +#include <linux/pm.h> +#include <linux/delay.h> +#include <uapi/linux/psci.h>  #include <asm/compiler.h> +#include <asm/cpu_ops.h>  #include <asm/errno.h>  #include <asm/psci.h> +#include <asm/smp_plat.h> +#include <asm/system_misc.h> -struct psci_operations psci_ops; +#define PSCI_POWER_STATE_TYPE_STANDBY		0 +#define PSCI_POWER_STATE_TYPE_POWER_DOWN	1 + +struct psci_power_state { +	u16	id; +	u8	type; +	u8	affinity_level; +}; + +struct psci_operations { +	int (*cpu_suspend)(struct psci_power_state state, +			   unsigned long entry_point); +	int (*cpu_off)(struct psci_power_state state); +	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point); +	int (*migrate)(unsigned long cpuid); +	int (*affinity_info)(unsigned long target_affinity, +			unsigned long lowest_affinity_level); +	int (*migrate_info_type)(void); +}; + +static struct psci_operations psci_ops;  static int (*invoke_psci_fn)(u64, u64, u64, u64); +typedef int (*psci_initcall_t)(const struct device_node *);  enum psci_function {  	PSCI_FN_CPU_SUSPEND,  	PSCI_FN_CPU_ON,  	PSCI_FN_CPU_OFF,  	PSCI_FN_MIGRATE, +	PSCI_FN_AFFINITY_INFO, +	PSCI_FN_MIGRATE_INFO_TYPE,  	PSCI_FN_MAX,  };  static u32 psci_function_id[PSCI_FN_MAX]; -#define PSCI_RET_SUCCESS		0 -#define PSCI_RET_EOPNOTSUPP		-1 -#define PSCI_RET_EINVAL			-2 -#define PSCI_RET_EPERM			-3 -  static int psci_to_linux_errno(int errno)  {  	switch (errno) {  	case PSCI_RET_SUCCESS:  		return 0; -	case PSCI_RET_EOPNOTSUPP: +	case PSCI_RET_NOT_SUPPORTED:  		return -EOPNOTSUPP; -	case PSCI_RET_EINVAL: +	case PSCI_RET_INVALID_PARAMS:  		return -EINVAL; -	case PSCI_RET_EPERM: +	case PSCI_RET_DENIED:  		return -EPERM;  	};  	return -EINVAL;  } -#define 
PSCI_POWER_STATE_ID_MASK	0xffff -#define PSCI_POWER_STATE_ID_SHIFT	0 -#define PSCI_POWER_STATE_TYPE_MASK	0x1 -#define PSCI_POWER_STATE_TYPE_SHIFT	16 -#define PSCI_POWER_STATE_AFFL_MASK	0x3 -#define PSCI_POWER_STATE_AFFL_SHIFT	24 -  static u32 psci_power_state_pack(struct psci_power_state state)  { -	return	((state.id & PSCI_POWER_STATE_ID_MASK) -			<< PSCI_POWER_STATE_ID_SHIFT)	| -		((state.type & PSCI_POWER_STATE_TYPE_MASK) -			<< PSCI_POWER_STATE_TYPE_SHIFT)	| -		((state.affinity_level & PSCI_POWER_STATE_AFFL_MASK) -			<< PSCI_POWER_STATE_AFFL_SHIFT); +	return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT) +			& PSCI_0_2_POWER_STATE_ID_MASK) | +		((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT) +		 & PSCI_0_2_POWER_STATE_TYPE_MASK) | +		((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT) +		 & PSCI_0_2_POWER_STATE_AFFL_MASK);  }  /* @@ -108,6 +127,14 @@ static noinline int __invoke_psci_fn_smc(u64 function_id, u64 arg0, u64 arg1,  	return function_id;  } +static int psci_get_version(void) +{ +	int err; + +	err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); +	return err; +} +  static int psci_cpu_suspend(struct psci_power_state state,  			    unsigned long entry_point)  { @@ -151,28 +178,36 @@ static int psci_migrate(unsigned long cpuid)  	return psci_to_linux_errno(err);  } -static const struct of_device_id psci_of_match[] __initconst = { -	{ .compatible = "arm,psci",	}, -	{}, -}; +static int psci_affinity_info(unsigned long target_affinity, +		unsigned long lowest_affinity_level) +{ +	int err; +	u32 fn; -int __init psci_init(void) +	fn = psci_function_id[PSCI_FN_AFFINITY_INFO]; +	err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0); +	return err; +} + +static int psci_migrate_info_type(void)  { -	struct device_node *np; -	const char *method; -	u32 id; -	int err = 0; +	int err; +	u32 fn; -	np = of_find_matching_node(NULL, psci_of_match); -	if (!np) -		return -ENODEV; +	fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE]; +	err = invoke_psci_fn(fn, 0, 0, 0); +	return err; +} + +static int get_set_conduit_method(struct device_node *np) +{ +	const char *method; -	pr_info("probing function IDs from device-tree\n"); +	pr_info("probing for conduit method from DT.\n");  	if (of_property_read_string(np, "method", &method)) { -		pr_warning("missing \"method\" property\n"); -		err = -ENXIO; -		goto out_put_node; +		pr_warn("missing \"method\" property\n"); +		return -ENXIO;  	}  	if (!strcmp("hvc", method)) { @@ -180,11 +215,99 @@ int __init psci_init(void)  	} else if (!strcmp("smc", method)) {  		invoke_psci_fn = __invoke_psci_fn_smc;  	} else { -		pr_warning("invalid \"method\" property: %s\n", method); -		err = -EINVAL; +		pr_warn("invalid \"method\" property: %s\n", method); +		return -EINVAL; +	} +	return 0; +} + +static void psci_sys_reset(enum reboot_mode reboot_mode, const char *cmd) +{ +	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_RESET, 0, 0, 0); +} + +static void psci_sys_poweroff(void) +{ +	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); +} + +/* + * PSCI Function IDs for v0.2+ are well defined so use + * standard values. + */ +static int psci_0_2_init(struct device_node *np) +{ +	int err, ver; + +	err = get_set_conduit_method(np); + +	if (err)  		goto out_put_node; + +	ver = psci_get_version(); + +	if (ver == PSCI_RET_NOT_SUPPORTED) { +		/* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. 
*/ +		pr_err("PSCI firmware does not comply with the v0.2 spec.\n"); +		err = -EOPNOTSUPP; +		goto out_put_node; +	} else { +		pr_info("PSCIv%d.%d detected in firmware.\n", +				PSCI_VERSION_MAJOR(ver), +				PSCI_VERSION_MINOR(ver)); + +		if (PSCI_VERSION_MAJOR(ver) == 0 && +				PSCI_VERSION_MINOR(ver) < 2) { +			err = -EINVAL; +			pr_err("Conflicting PSCI version detected.\n"); +			goto out_put_node; +		}  	} +	pr_info("Using standard PSCI v0.2 function IDs\n"); +	psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND; +	psci_ops.cpu_suspend = psci_cpu_suspend; + +	psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF; +	psci_ops.cpu_off = psci_cpu_off; + +	psci_function_id[PSCI_FN_CPU_ON] = PSCI_0_2_FN64_CPU_ON; +	psci_ops.cpu_on = psci_cpu_on; + +	psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE; +	psci_ops.migrate = psci_migrate; + +	psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN64_AFFINITY_INFO; +	psci_ops.affinity_info = psci_affinity_info; + +	psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] = +		PSCI_0_2_FN_MIGRATE_INFO_TYPE; +	psci_ops.migrate_info_type = psci_migrate_info_type; + +	arm_pm_restart = psci_sys_reset; + +	pm_power_off = psci_sys_poweroff; + +out_put_node: +	of_node_put(np); +	return err; +} + +/* + * PSCI < v0.2 get PSCI Function IDs via DT. + */ +static int psci_0_1_init(struct device_node *np) +{ +	u32 id; +	int err; + +	err = get_set_conduit_method(np); + +	if (err) +		goto out_put_node; + +	pr_info("Using PSCI v0.1 Function IDs from DT\n"); +  	if (!of_property_read_u32(np, "cpu_suspend", &id)) {  		psci_function_id[PSCI_FN_CPU_SUSPEND] = id;  		psci_ops.cpu_suspend = psci_cpu_suspend; @@ -209,3 +332,119 @@ out_put_node:  	of_node_put(np);  	return err;  } + +static const struct of_device_id psci_of_match[] __initconst = { +	{ .compatible = "arm,psci",	.data = psci_0_1_init}, +	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init}, +	{}, +}; + +int __init psci_init(void) +{ +	struct device_node *np; +	const struct of_device_id *matched_np; +	psci_initcall_t init_fn; + +	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np); + +	if (!np) +		return -ENODEV; + +	init_fn = (psci_initcall_t)matched_np->data; +	return init_fn(np); +} + +#ifdef CONFIG_SMP + +static int __init cpu_psci_cpu_init(struct device_node *dn, unsigned int cpu) +{ +	return 0; +} + +static int __init cpu_psci_cpu_prepare(unsigned int cpu) +{ +	if (!psci_ops.cpu_on) { +		pr_err("no cpu_on method, not booting CPU%d\n", cpu); +		return -ENODEV; +	} + +	return 0; +} + +static int cpu_psci_cpu_boot(unsigned int cpu) +{ +	int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa(secondary_entry)); +	if (err) +		pr_err("failed to boot CPU%d (%d)\n", cpu, err); + +	return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int cpu_psci_cpu_disable(unsigned int cpu) +{ +	/* Fail early if we don't have CPU_OFF support */ +	if (!psci_ops.cpu_off) +		return -EOPNOTSUPP; +	return 0; +} + +static void cpu_psci_cpu_die(unsigned int cpu) +{ +	int ret; +	/* +	 * There are no known implementations of PSCI actually using the +	 * power state field, pass a sensible default for now. 
+	 */ +	struct psci_power_state state = { +		.type = PSCI_POWER_STATE_TYPE_POWER_DOWN, +	}; + +	ret = psci_ops.cpu_off(state); + +	pr_crit("unable to power off CPU%u (%d)\n", cpu, ret); +} + +static int cpu_psci_cpu_kill(unsigned int cpu) +{ +	int err, i; + +	if (!psci_ops.affinity_info) +		return 1; +	/* +	 * cpu_kill could race with cpu_die and we can +	 * potentially end up declaring this cpu undead +	 * while it is dying. So, try again a few times. +	 */ + +	for (i = 0; i < 10; i++) { +		err = psci_ops.affinity_info(cpu_logical_map(cpu), 0); +		if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) { +			pr_info("CPU%d killed.\n", cpu); +			return 1; +		} + +		msleep(10); +		pr_info("Retrying again to check for CPU kill\n"); +	} + +	pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n", +			cpu, err); +	/* Make op_cpu_kill() fail. */ +	return 0; +} +#endif + +const struct cpu_operations cpu_psci_ops = { +	.name		= "psci", +	.cpu_init	= cpu_psci_cpu_init, +	.cpu_prepare	= cpu_psci_cpu_prepare, +	.cpu_boot	= cpu_psci_cpu_boot, +#ifdef CONFIG_HOTPLUG_CPU +	.cpu_disable	= cpu_psci_cpu_disable, +	.cpu_die	= cpu_psci_cpu_die, +	.cpu_kill	= cpu_psci_cpu_kill, +#endif +}; + +#endif diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index fecdbf7de82..9fde010c945 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -19,6 +19,7 @@   * along with this program.  If not, see <http://www.gnu.org/licenses/>.   */ +#include <linux/compat.h>  #include <linux/kernel.h>  #include <linux/sched.h>  #include <linux/mm.h> @@ -41,6 +42,9 @@  #include <asm/traps.h>  #include <asm/system_misc.h> +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> +  /*   * TODO: does not yet catch signals sent when the child dies.   * in exit.c or in signal.c. 
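
A note on the PSCI v0.2 plumbing above: psci_power_state_pack() and psci_get_version() only marshal bit-fields for the firmware call. The short C sketch below restates that encoding in isolation; the EX_* constants and the major/minor split are local stand-ins mirroring what uapi/linux/psci.h is expected to define, not code taken from this series.

/* Illustration only: layout of a packed power_state and a PSCI version word.
 * The shift values below are assumptions restated for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_PSCI_STATE_ID_SHIFT		0	/* state id:          bits [15:0]  */
#define EX_PSCI_STATE_TYPE_SHIFT	16	/* standby/power-down: bit 16      */
#define EX_PSCI_STATE_AFFL_SHIFT	24	/* affinity level:    bits [25:24] */

static uint32_t ex_power_state_pack(uint16_t id, uint8_t type, uint8_t affl)
{
	return ((uint32_t)id   << EX_PSCI_STATE_ID_SHIFT) |
	       ((uint32_t)type << EX_PSCI_STATE_TYPE_SHIFT) |
	       ((uint32_t)affl << EX_PSCI_STATE_AFFL_SHIFT);
}

int main(void)
{
	/* The power-down state used by cpu_psci_cpu_die() packs to 0x10000
	 * under the shifts assumed above. */
	printf("power_state = %#x\n", (unsigned int)ex_power_state_pack(0, 1, 0));

	/* The version word splits as major[31:16].minor[15:0], so a v0.2
	 * implementation is expected to report 0x00000002. */
	uint32_t ver = 0x00000002;
	printf("PSCIv%u.%u\n", (unsigned int)(ver >> 16),
	       (unsigned int)(ver & 0xffff));
	return 0;
}

The same packed word is what functions like psci_cpu_off() hand to the firmware; only the conduit (hvc or smc) differs, which is all get_set_conduit_method() selects.
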
@@ -214,31 +218,29 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,  {  	int err, len, type, disabled = !ctrl.enabled; -	if (disabled) { -		len = 0; -		type = HW_BREAKPOINT_EMPTY; -	} else { -		err = arch_bp_generic_fields(ctrl, &len, &type); -		if (err) -			return err; - -		switch (note_type) { -		case NT_ARM_HW_BREAK: -			if ((type & HW_BREAKPOINT_X) != type) -				return -EINVAL; -			break; -		case NT_ARM_HW_WATCH: -			if ((type & HW_BREAKPOINT_RW) != type) -				return -EINVAL; -			break; -		default: +	attr->disabled = disabled; +	if (disabled) +		return 0; + +	err = arch_bp_generic_fields(ctrl, &len, &type); +	if (err) +		return err; + +	switch (note_type) { +	case NT_ARM_HW_BREAK: +		if ((type & HW_BREAKPOINT_X) != type)  			return -EINVAL; -		} +		break; +	case NT_ARM_HW_WATCH: +		if ((type & HW_BREAKPOINT_RW) != type) +			return -EINVAL; +		break; +	default: +		return -EINVAL;  	}  	attr->bp_len	= len;  	attr->bp_type	= type; -	attr->disabled	= disabled;  	return 0;  } @@ -519,6 +521,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,  		return ret;  	target->thread.fpsimd_state.user_fpsimd = newstate; +	fpsimd_flush_task_state(target);  	return ret;  } @@ -636,28 +639,32 @@ static int compat_gpr_get(struct task_struct *target,  	for (i = 0; i < num_regs; ++i) {  		unsigned int idx = start + i; -		void *reg; +		compat_ulong_t reg;  		switch (idx) {  		case 15: -			reg = (void *)&task_pt_regs(target)->pc; +			reg = task_pt_regs(target)->pc;  			break;  		case 16: -			reg = (void *)&task_pt_regs(target)->pstate; +			reg = task_pt_regs(target)->pstate;  			break;  		case 17: -			reg = (void *)&task_pt_regs(target)->orig_x0; +			reg = task_pt_regs(target)->orig_x0;  			break;  		default: -			reg = (void *)&task_pt_regs(target)->regs[idx]; +			reg = task_pt_regs(target)->regs[idx];  		} -		ret = copy_to_user(ubuf, reg, sizeof(compat_ulong_t)); +		if (kbuf) { +			memcpy(kbuf, ®, sizeof(reg)); +			kbuf += sizeof(reg); +		} else { +			ret = copy_to_user(ubuf, ®, sizeof(reg)); +			if (ret) +				break; -		if (ret) -			break; -		else -			ubuf += sizeof(compat_ulong_t); +			ubuf += sizeof(reg); +		}  	}  	return ret; @@ -685,28 +692,33 @@ static int compat_gpr_set(struct task_struct *target,  	for (i = 0; i < num_regs; ++i) {  		unsigned int idx = start + i; -		void *reg; +		compat_ulong_t reg; + +		if (kbuf) { +			memcpy(®, kbuf, sizeof(reg)); +			kbuf += sizeof(reg); +		} else { +			ret = copy_from_user(®, ubuf, sizeof(reg)); +			if (ret) +				return ret; + +			ubuf += sizeof(reg); +		}  		switch (idx) {  		case 15: -			reg = (void *)&newregs.pc; +			newregs.pc = reg;  			break;  		case 16: -			reg = (void *)&newregs.pstate; +			newregs.pstate = reg;  			break;  		case 17: -			reg = (void *)&newregs.orig_x0; +			newregs.orig_x0 = reg;  			break;  		default: -			reg = (void *)&newregs.regs[idx]; +			newregs.regs[idx] = reg;  		} -		ret = copy_from_user(reg, ubuf, sizeof(compat_ulong_t)); - -		if (ret) -			goto out; -		else -			ubuf += sizeof(compat_ulong_t);  	}  	if (valid_user_regs(&newregs.user_regs)) @@ -714,7 +726,6 @@ static int compat_gpr_set(struct task_struct *target,  	else  		ret = -EINVAL; -out:  	return ret;  } @@ -768,6 +779,7 @@ static int compat_vfp_set(struct task_struct *target,  		uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;  	} +	fpsimd_flush_task_state(target);  	return ret;  } @@ -825,6 +837,7 @@ static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off,  				    compat_ulong_t val)  {  	int ret; +	
mm_segment_t old_fs = get_fs();  	if (off & 3 || off >= COMPAT_USER_SZ)  		return -EIO; @@ -832,10 +845,13 @@ static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off,  	if (off >= sizeof(compat_elf_gregset_t))  		return 0; +	set_fs(KERNEL_DS);  	ret = copy_regset_from_user(tsk, &user_aarch32_view,  				    REGSET_COMPAT_GPR, off,  				    sizeof(compat_ulong_t),  				    &val); +	set_fs(old_fs); +  	return ret;  } @@ -1062,35 +1078,49 @@ long arch_ptrace(struct task_struct *child, long request,  	return ptrace_request(child, request, addr, data);  } -asmlinkage int syscall_trace(int dir, struct pt_regs *regs) +enum ptrace_syscall_dir { +	PTRACE_SYSCALL_ENTER = 0, +	PTRACE_SYSCALL_EXIT, +}; + +static void tracehook_report_syscall(struct pt_regs *regs, +				     enum ptrace_syscall_dir dir)  { +	int regno;  	unsigned long saved_reg; -	if (!test_thread_flag(TIF_SYSCALL_TRACE)) -		return regs->syscallno; - -	if (is_compat_task()) { -		/* AArch32 uses ip (r12) for scratch */ -		saved_reg = regs->regs[12]; -		regs->regs[12] = dir; -	} else { -		/* -		 * Save X7. X7 is used to denote syscall entry/exit: -		 *   X7 = 0 -> entry, = 1 -> exit -		 */ -		saved_reg = regs->regs[7]; -		regs->regs[7] = dir; -	} +	/* +	 * A scratch register (ip(r12) on AArch32, x7 on AArch64) is +	 * used to denote syscall entry/exit: +	 */ +	regno = (is_compat_task() ? 12 : 7); +	saved_reg = regs->regs[regno]; +	regs->regs[regno] = dir; -	if (dir) +	if (dir == PTRACE_SYSCALL_EXIT)  		tracehook_report_syscall_exit(regs, 0);  	else if (tracehook_report_syscall_entry(regs))  		regs->syscallno = ~0UL; -	if (is_compat_task()) -		regs->regs[12] = saved_reg; -	else -		regs->regs[7] = saved_reg; +	regs->regs[regno] = saved_reg; +} + +asmlinkage int syscall_trace_enter(struct pt_regs *regs) +{ +	if (test_thread_flag(TIF_SYSCALL_TRACE)) +		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + +	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) +		trace_sys_enter(regs, regs->syscallno);  	return regs->syscallno;  } + +asmlinkage void syscall_trace_exit(struct pt_regs *regs) +{ +	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) +		trace_sys_exit(regs, regs_return_value(regs)); + +	if (test_thread_flag(TIF_SYSCALL_TRACE)) +		tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); +} diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c new file mode 100644 index 00000000000..89102a6ffad --- /dev/null +++ b/arch/arm64/kernel/return_address.c @@ -0,0 +1,55 @@ +/* + * arch/arm64/kernel/return_address.c + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/export.h> +#include <linux/ftrace.h> + +#include <asm/stacktrace.h> + +struct return_address_data { +	unsigned int level; +	void *addr; +}; + +static int save_return_addr(struct stackframe *frame, void *d) +{ +	struct return_address_data *data = d; + +	if (!data->level) { +		data->addr = (void *)frame->pc; +		return 1; +	} else { +		--data->level; +		return 0; +	} +} + +void *return_address(unsigned int level) +{ +	struct return_address_data data; +	struct stackframe frame; +	register unsigned long current_sp asm ("sp"); + +	data.level = level + 2; +	data.addr = NULL; + +	frame.fp = (unsigned long)__builtin_frame_address(0); +	frame.sp = current_sp; +	frame.pc = (unsigned long)return_address; /* dummy */ + +	walk_stackframe(&frame, save_return_addr, &data); + +	if (!data.level) +		return data.addr; +	else +		return NULL; +} +EXPORT_SYMBOL_GPL(return_address); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 055cfb80e05..46d1125571f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -25,6 +25,7 @@  #include <linux/utsname.h>  #include <linux/initrd.h>  #include <linux/console.h> +#include <linux/cache.h>  #include <linux/bootmem.h>  #include <linux/seq_file.h>  #include <linux/screen_info.h> @@ -41,10 +42,13 @@  #include <linux/memblock.h>  #include <linux/of_fdt.h>  #include <linux/of_platform.h> +#include <linux/efi.h> +#include <asm/fixmap.h>  #include <asm/cputype.h>  #include <asm/elf.h>  #include <asm/cputable.h> +#include <asm/cpu_ops.h>  #include <asm/sections.h>  #include <asm/setup.h>  #include <asm/smp_plat.h> @@ -53,6 +57,7 @@  #include <asm/traps.h>  #include <asm/memblock.h>  #include <asm/psci.h> +#include <asm/efi.h>  unsigned int processor_id;  EXPORT_SYMBOL(processor_id); @@ -60,6 +65,17 @@ EXPORT_SYMBOL(processor_id);  unsigned long elf_hwcap __read_mostly;  EXPORT_SYMBOL_GPL(elf_hwcap); +#ifdef CONFIG_COMPAT +#define COMPAT_ELF_HWCAP_DEFAULT	\ +				(COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\ +				 COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\ +				 COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\ +				 COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\ +				 COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV) +unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; +unsigned int compat_elf_hwcap2 __read_mostly; +#endif +  static const char *cpu_name;  static const char *machine_name;  phys_addr_t __fdt_pointer __initdata; @@ -97,15 +113,97 @@ void __init early_print(const char *str, ...)  	printk("%s", buf);  } -static void __init setup_processor(void) +void __init smp_setup_processor_id(void)  { -	struct cpu_info *cpu_info; +	/* +	 * clear __my_cpu_offset on boot CPU to avoid hang caused by +	 * using percpu variable early, for example, lockdep will +	 * access percpu variable inside lock_release +	 */ +	set_my_cpu_offset(0); +} + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ +	return phys_id == cpu_logical_map(cpu); +} +struct mpidr_hash mpidr_hash; +#ifdef CONFIG_SMP +/** + * smp_build_mpidr_hash - Pre-compute shifts required at each affinity + *			  level in order to build a linear index from an + *			  MPIDR value. Resulting algorithm is a collision + *			  free hash carried out through shifting and ORing + */ +static void __init smp_build_mpidr_hash(void) +{ +	u32 i, affinity, fs[4], bits[4], ls; +	u64 mask = 0; +	/* +	 * Pre-scan the list of MPIDRS and filter out bits that do +	 * not contribute to affinity levels, ie they never toggle. 
+	 */ +	for_each_possible_cpu(i) +		mask |= (cpu_logical_map(i) ^ cpu_logical_map(0)); +	pr_debug("mask of set bits %#llx\n", mask);  	/* -	 * locate processor in the list of supported processor -	 * types.  The linker builds this table for us from the -	 * entries in arch/arm/mm/proc.S +	 * Find and stash the last and first bit set at all affinity levels to +	 * check how many bits are required to represent them.  	 */ +	for (i = 0; i < 4; i++) { +		affinity = MPIDR_AFFINITY_LEVEL(mask, i); +		/* +		 * Find the MSB bit and LSB bits position +		 * to determine how many bits are required +		 * to express the affinity level. +		 */ +		ls = fls(affinity); +		fs[i] = affinity ? ffs(affinity) - 1 : 0; +		bits[i] = ls - fs[i]; +	} +	/* +	 * An index can be created from the MPIDR_EL1 by isolating the +	 * significant bits at each affinity level and by shifting +	 * them in order to compress the 32 bits values space to a +	 * compressed set of values. This is equivalent to hashing +	 * the MPIDR_EL1 through shifting and ORing. It is a collision free +	 * hash though not minimal since some levels might contain a number +	 * of CPUs that is not an exact power of 2 and their bit +	 * representation might contain holes, eg MPIDR_EL1[7:0] = {0x2, 0x80}. +	 */ +	mpidr_hash.shift_aff[0] = MPIDR_LEVEL_SHIFT(0) + fs[0]; +	mpidr_hash.shift_aff[1] = MPIDR_LEVEL_SHIFT(1) + fs[1] - bits[0]; +	mpidr_hash.shift_aff[2] = MPIDR_LEVEL_SHIFT(2) + fs[2] - +						(bits[1] + bits[0]); +	mpidr_hash.shift_aff[3] = MPIDR_LEVEL_SHIFT(3) + +				  fs[3] - (bits[2] + bits[1] + bits[0]); +	mpidr_hash.mask = mask; +	mpidr_hash.bits = bits[3] + bits[2] + bits[1] + bits[0]; +	pr_debug("MPIDR hash: aff0[%u] aff1[%u] aff2[%u] aff3[%u] mask[%#llx] bits[%u]\n", +		mpidr_hash.shift_aff[0], +		mpidr_hash.shift_aff[1], +		mpidr_hash.shift_aff[2], +		mpidr_hash.shift_aff[3], +		mpidr_hash.mask, +		mpidr_hash.bits); +	/* +	 * 4x is an arbitrary value used to warn on a hash table much bigger +	 * than expected on most systems. +	 */ +	if (mpidr_hash_size() > 4 * num_possible_cpus()) +		pr_warn("Large number of MPIDR hash buckets detected\n"); +	__flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash)); +} +#endif + +static void __init setup_processor(void) +{ +	struct cpu_info *cpu_info; +	u64 features, block; +	u32 cwg; +	int cls; +  	cpu_info = lookup_processor_type(read_cpuid_id());  	if (!cpu_info) {  		printk("CPU configuration botched (ID %08x), unable to continue.\n", @@ -118,76 +216,99 @@ static void __init setup_processor(void)  	printk("CPU: %s [%08x] revision %d\n",  	       cpu_name, read_cpuid_id(), read_cpuid_id() & 15); -	sprintf(init_utsname()->machine, "aarch64"); +	sprintf(init_utsname()->machine, ELF_PLATFORM);  	elf_hwcap = 0; -} -static void __init setup_machine_fdt(phys_addr_t dt_phys) -{ -	struct boot_param_header *devtree; -	unsigned long dt_root; +	/* +	 * Check for sane CTR_EL0.CWG value. +	 */ +	cwg = cache_type_cwg(); +	cls = cache_line_size(); +	if (!cwg) +		pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n", +			cls); +	if (L1_CACHE_BYTES < cls) +		pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", +			L1_CACHE_BYTES, cls); -	/* Check we have a non-NULL DT pointer */ -	if (!dt_phys) { -		early_print("\n" -			"Error: NULL or invalid device tree blob\n" -			"The dtb must be 8-byte aligned and passed in the first 512MB of memory\n" -			"\nPlease check your bootloader.\n"); +	/* +	 * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. 
+	 * The blocks we test below represent incremental functionality +	 * for non-negative values. Negative values are reserved. +	 */ +	features = read_cpuid(ID_AA64ISAR0_EL1); +	block = (features >> 4) & 0xf; +	if (!(block & 0x8)) { +		switch (block) { +		default: +		case 2: +			elf_hwcap |= HWCAP_PMULL; +		case 1: +			elf_hwcap |= HWCAP_AES; +		case 0: +			break; +		} +	} -		while (true) -			cpu_relax(); +	block = (features >> 8) & 0xf; +	if (block && !(block & 0x8)) +		elf_hwcap |= HWCAP_SHA1; + +	block = (features >> 12) & 0xf; +	if (block && !(block & 0x8)) +		elf_hwcap |= HWCAP_SHA2; + +	block = (features >> 16) & 0xf; +	if (block && !(block & 0x8)) +		elf_hwcap |= HWCAP_CRC32; +#ifdef CONFIG_COMPAT +	/* +	 * ID_ISAR5_EL1 carries similar information as above, but pertaining to +	 * the Aarch32 32-bit execution state. +	 */ +	features = read_cpuid(ID_ISAR5_EL1); +	block = (features >> 4) & 0xf; +	if (!(block & 0x8)) { +		switch (block) { +		default: +		case 2: +			compat_elf_hwcap2 |= COMPAT_HWCAP2_PMULL; +		case 1: +			compat_elf_hwcap2 |= COMPAT_HWCAP2_AES; +		case 0: +			break; +		}  	} -	devtree = phys_to_virt(dt_phys); +	block = (features >> 8) & 0xf; +	if (block && !(block & 0x8)) +		compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA1; -	/* Check device tree validity */ -	if (be32_to_cpu(devtree->magic) != OF_DT_HEADER) { +	block = (features >> 12) & 0xf; +	if (block && !(block & 0x8)) +		compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA2; + +	block = (features >> 16) & 0xf; +	if (block && !(block & 0x8)) +		compat_elf_hwcap2 |= COMPAT_HWCAP2_CRC32; +#endif +} + +static void __init setup_machine_fdt(phys_addr_t dt_phys) +{ +	if (!dt_phys || !early_init_dt_scan(phys_to_virt(dt_phys))) {  		early_print("\n"  			"Error: invalid device tree blob at physical address 0x%p (virtual address 0x%p)\n" -			"Expected 0x%x, found 0x%x\n" +			"The dtb must be 8-byte aligned and passed in the first 512MB of memory\n"  			"\nPlease check your bootloader.\n", -			dt_phys, devtree, OF_DT_HEADER, -			be32_to_cpu(devtree->magic)); +			dt_phys, phys_to_virt(dt_phys));  		while (true)  			cpu_relax();  	} -	initial_boot_params = devtree; -	dt_root = of_get_flat_dt_root(); - -	machine_name = of_get_flat_dt_prop(dt_root, "model", NULL); -	if (!machine_name) -		machine_name = of_get_flat_dt_prop(dt_root, "compatible", NULL); -	if (!machine_name) -		machine_name = "<unknown>"; -	pr_info("Machine: %s\n", machine_name); - -	/* Retrieve various information from the /chosen node */ -	of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line); -	/* Initialize {size,address}-cells info */ -	of_scan_flat_dt(early_init_dt_scan_root, NULL); -	/* Setup memory, calling early_init_dt_add_memory_arch */ -	of_scan_flat_dt(early_init_dt_scan_memory, NULL); -} - -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ -	base &= PAGE_MASK; -	size &= PAGE_MASK; -	if (base + size < PHYS_OFFSET) { -		pr_warning("Ignoring memory block 0x%llx - 0x%llx\n", -			   base, base + size); -		return; -	} -	if (base < PHYS_OFFSET) { -		pr_warning("Ignoring memory range 0x%llx - 0x%llx\n", -			   base, PHYS_OFFSET); -		size -= PHYS_OFFSET - base; -		base = PHYS_OFFSET; -	} -	memblock_add(base, size); +	machine_name = of_flat_dt_get_machine_name();  }  /* @@ -241,6 +362,11 @@ u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };  void __init setup_arch(char **cmdline_p)  { +	/* +	 * Unmask asynchronous aborts early to catch possible system errors. 
+	 */ +	local_async_enable(); +  	setup_processor();  	setup_machine_fdt(__fdt_pointer); @@ -252,20 +378,27 @@ void __init setup_arch(char **cmdline_p)  	*cmdline_p = boot_command_line; +	early_ioremap_init(); +  	parse_early_param(); +	efi_init();  	arm64_memblock_init();  	paging_init();  	request_standard_resources(); +	efi_idmap_init(); +  	unflatten_device_tree();  	psci_init();  	cpu_logical_map(0) = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; +	cpu_read_bootcpu_ops();  #ifdef CONFIG_SMP  	smp_init_cpus(); +	smp_build_mpidr_hash();  #endif  #ifdef CONFIG_VT @@ -279,11 +412,10 @@ void __init setup_arch(char **cmdline_p)  static int __init arm64_device_init(void)  { -	of_clk_init(NULL);  	of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL);  	return 0;  } -arch_initcall(arm64_device_init); +arch_initcall_sync(arm64_device_init);  static DEFINE_PER_CPU(struct cpu, cpu_data); @@ -304,6 +436,12 @@ subsys_initcall(topology_init);  static const char *hwcap_str[] = {  	"fp",  	"asimd", +	"evtstrm", +	"aes", +	"pmull", +	"sha1", +	"sha2", +	"crc32",  	NULL  }; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 890a591f75d..6357b9c6c90 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -17,6 +17,7 @@   * along with this program.  If not, see <http://www.gnu.org/licenses/>.   */ +#include <linux/compat.h>  #include <linux/errno.h>  #include <linux/signal.h>  #include <linux/personality.h> @@ -25,7 +26,6 @@  #include <linux/tracehook.h>  #include <linux/ratelimit.h> -#include <asm/compat.h>  #include <asm/debug-monitors.h>  #include <asm/elf.h>  #include <asm/cacheflush.h> @@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)  	int err;  	/* dump the hardware registers to the fpsimd_state structure */ -	fpsimd_save_state(fpsimd); +	fpsimd_preserve_current_state();  	/* copy the FP and status/control registers */  	err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); @@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)  	__get_user_error(fpsimd.fpcr, &ctx->fpcr, err);  	/* load the hardware registers from the fpsimd_state structure */ -	if (!err) { -		preempt_disable(); -		fpsimd_load_state(&fpsimd); -		preempt_enable(); -	} +	if (!err) +		fpsimd_update_current_state(&fpsimd);  	return err ? 
-EFAULT : 0;  } @@ -100,8 +97,7 @@ static int restore_sigframe(struct pt_regs *regs,  {  	sigset_t set;  	int i, err; -	struct aux_context __user *aux = -		(struct aux_context __user *)sf->uc.uc_mcontext.__reserved; +	void *aux = sf->uc.uc_mcontext.__reserved;  	err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));  	if (err == 0) @@ -121,8 +117,11 @@ static int restore_sigframe(struct pt_regs *regs,  	err |= !valid_user_regs(®s->user_regs); -	if (err == 0) -		err |= restore_fpsimd_context(&aux->fpsimd); +	if (err == 0) { +		struct fpsimd_context *fpsimd_ctx = +			container_of(aux, struct fpsimd_context, head); +		err |= restore_fpsimd_context(fpsimd_ctx); +	}  	return err;  } @@ -167,8 +166,8 @@ static int setup_sigframe(struct rt_sigframe __user *sf,  			  struct pt_regs *regs, sigset_t *set)  {  	int i, err = 0; -	struct aux_context __user *aux = -		(struct aux_context __user *)sf->uc.uc_mcontext.__reserved; +	void *aux = sf->uc.uc_mcontext.__reserved; +	struct _aarch64_ctx *end;  	/* set up the stack frame for unwinding */  	__put_user_error(regs->regs[29], &sf->fp, err); @@ -185,12 +184,27 @@ static int setup_sigframe(struct rt_sigframe __user *sf,  	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); -	if (err == 0) -		err |= preserve_fpsimd_context(&aux->fpsimd); +	if (err == 0) { +		struct fpsimd_context *fpsimd_ctx = +			container_of(aux, struct fpsimd_context, head); +		err |= preserve_fpsimd_context(fpsimd_ctx); +		aux += sizeof(*fpsimd_ctx); +	} + +	/* fault information, if valid */ +	if (current->thread.fault_code) { +		struct esr_context *esr_ctx = +			container_of(aux, struct esr_context, head); +		__put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); +		__put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); +		__put_user_error(current->thread.fault_code, &esr_ctx->esr, err); +		aux += sizeof(*esr_ctx); +	}  	/* set the "end" magic */ -	__put_user_error(0, &aux->end.magic, err); -	__put_user_error(0, &aux->end.size, err); +	end = aux; +	__put_user_error(0, &end->magic, err); +	__put_user_error(0, &end->size, err);  	return err;  } @@ -416,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,  		clear_thread_flag(TIF_NOTIFY_RESUME);  		tracehook_notify_resume(regs);  	} + +	if (thread_flags & _TIF_FOREIGN_FPSTATE) +		fpsimd_restore_current_state(); +  } diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index e393174fe85..3491c638f17 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -23,6 +23,7 @@  #include <linux/syscalls.h>  #include <linux/ratelimit.h> +#include <asm/esr.h>  #include <asm/fpsimd.h>  #include <asm/signal32.h>  #include <asm/uaccess.h> @@ -81,6 +82,8 @@ struct compat_vfp_sigframe {  #define VFP_MAGIC		0x56465001  #define VFP_STORAGE_SIZE	sizeof(struct compat_vfp_sigframe) +#define FSR_WRITE_SHIFT		(11) +  struct compat_aux_sigframe {  	struct compat_vfp_sigframe	vfp; @@ -100,34 +103,6 @@ struct compat_rt_sigframe {  #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -/* - * For ARM syscalls, the syscall number has to be loaded into r7. - * We do not support an OABI userspace. - */ -#define MOV_R7_NR_SIGRETURN	(0xe3a07000 | __NR_compat_sigreturn) -#define SVC_SYS_SIGRETURN	(0xef000000 | __NR_compat_sigreturn) -#define MOV_R7_NR_RT_SIGRETURN	(0xe3a07000 | __NR_compat_rt_sigreturn) -#define SVC_SYS_RT_SIGRETURN	(0xef000000 | __NR_compat_rt_sigreturn) - -/* - * For Thumb syscalls, we also pass the syscall number via r7. 
We therefore - * need two 16-bit instructions. - */ -#define SVC_THUMB_SIGRETURN	(((0xdf00 | __NR_compat_sigreturn) << 16) | \ -				   0x2700 | __NR_compat_sigreturn) -#define SVC_THUMB_RT_SIGRETURN	(((0xdf00 | __NR_compat_rt_sigreturn) << 16) | \ -				   0x2700 | __NR_compat_rt_sigreturn) - -const compat_ulong_t aarch32_sigret_code[6] = { -	/* -	 * AArch32 sigreturn code. -	 * We don't construct an OABI SWI - instead we just set the imm24 field -	 * to the EABI syscall number so that we create a sane disassembly. -	 */ -	MOV_R7_NR_SIGRETURN,    SVC_SYS_SIGRETURN,    SVC_THUMB_SIGRETURN, -	MOV_R7_NR_RT_SIGRETURN, SVC_SYS_RT_SIGRETURN, SVC_THUMB_RT_SIGRETURN, -}; -  static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set)  {  	compat_sigset_t	cset; @@ -150,7 +125,7 @@ static inline int get_sigset_t(sigset_t *set,  	return 0;  } -int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)  {  	int err; @@ -247,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)  	 * Note that this also saves V16-31, which aren't visible  	 * in AArch32.  	 */ -	fpsimd_save_state(fpsimd); +	fpsimd_preserve_current_state();  	/* Place structure header on the stack */  	__put_user_error(magic, &frame->magic, err); @@ -310,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)  	 * We don't need to touch the exception register, so  	 * reload the hardware state.  	 */ -	if (!err) { -		preempt_disable(); -		fpsimd_load_state(&fpsimd); -		preempt_enable(); -	} +	if (!err) +		fpsimd_update_current_state(&fpsimd);  	return err ? -EFAULT : 0;  } @@ -474,12 +446,13 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka,  	/* Check if the handler is written for ARM or Thumb */  	thumb = handler & 1; -	if (thumb) { +	if (thumb)  		spsr |= COMPAT_PSR_T_BIT; -		spsr &= ~COMPAT_PSR_IT_MASK; -	} else { +	else  		spsr &= ~COMPAT_PSR_T_BIT; -	} + +	/* The IT state must be cleared for both ARM and Thumb-2 */ +	spsr &= ~COMPAT_PSR_IT_MASK;  	if (ka->sa.sa_flags & SA_RESTORER) {  		retcode = ptr_to_compat(ka->sa.sa_restorer); @@ -527,7 +500,9 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf,  	__put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err);  	__put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); -	__put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.error_code, err); +	/* set the compat FSR WnR */ +	__put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) << +			 FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err);  	__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);  	__put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S new file mode 100644 index 00000000000..b1925729c69 --- /dev/null +++ b/arch/arm64/kernel/sleep.S @@ -0,0 +1,184 @@ +#include <linux/errno.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/assembler.h> + +	.text +/* + * Implementation of MPIDR_EL1 hash algorithm through shifting + * and OR'ing. 
+ * + * @dst: register containing hash result + * @rs0: register containing affinity level 0 bit shift + * @rs1: register containing affinity level 1 bit shift + * @rs2: register containing affinity level 2 bit shift + * @rs3: register containing affinity level 3 bit shift + * @mpidr: register containing MPIDR_EL1 value + * @mask: register containing MPIDR mask + * + * Pseudo C-code: + * + *u32 dst; + * + *compute_mpidr_hash(u32 rs0, u32 rs1, u32 rs2, u32 rs3, u64 mpidr, u64 mask) { + *	u32 aff0, aff1, aff2, aff3; + *	u64 mpidr_masked = mpidr & mask; + *	aff0 = mpidr_masked & 0xff; + *	aff1 = mpidr_masked & 0xff00; + *	aff2 = mpidr_masked & 0xff0000; + *	aff2 = mpidr_masked & 0xff00000000; + *	dst = (aff0 >> rs0 | aff1 >> rs1 | aff2 >> rs2 | aff3 >> rs3); + *} + * Input registers: rs0, rs1, rs2, rs3, mpidr, mask + * Output register: dst + * Note: input and output registers must be disjoint register sets +         (eg: a macro instance with mpidr = x1 and dst = x1 is invalid) + */ +	.macro compute_mpidr_hash dst, rs0, rs1, rs2, rs3, mpidr, mask +	and	\mpidr, \mpidr, \mask		// mask out MPIDR bits +	and	\dst, \mpidr, #0xff		// mask=aff0 +	lsr	\dst ,\dst, \rs0		// dst=aff0>>rs0 +	and	\mask, \mpidr, #0xff00		// mask = aff1 +	lsr	\mask ,\mask, \rs1 +	orr	\dst, \dst, \mask		// dst|=(aff1>>rs1) +	and	\mask, \mpidr, #0xff0000	// mask = aff2 +	lsr	\mask ,\mask, \rs2 +	orr	\dst, \dst, \mask		// dst|=(aff2>>rs2) +	and	\mask, \mpidr, #0xff00000000	// mask = aff3 +	lsr	\mask ,\mask, \rs3 +	orr	\dst, \dst, \mask		// dst|=(aff3>>rs3) +	.endm +/* + * Save CPU state for a suspend.  This saves callee registers, and allocates + * space on the kernel stack to save the CPU specific registers + some + * other data for resume. + * + *  x0 = suspend finisher argument + */ +ENTRY(__cpu_suspend) +	stp	x29, lr, [sp, #-96]! +	stp	x19, x20, [sp,#16] +	stp	x21, x22, [sp,#32] +	stp	x23, x24, [sp,#48] +	stp	x25, x26, [sp,#64] +	stp	x27, x28, [sp,#80] +	mov	x2, sp +	sub	sp, sp, #CPU_SUSPEND_SZ	// allocate cpu_suspend_ctx +	mov	x1, sp +	/* +	 * x1 now points to struct cpu_suspend_ctx allocated on the stack +	 */ +	str	x2, [x1, #CPU_CTX_SP] +	ldr	x2, =sleep_save_sp +	ldr	x2, [x2, #SLEEP_SAVE_SP_VIRT] +#ifdef CONFIG_SMP +	mrs	x7, mpidr_el1 +	ldr	x9, =mpidr_hash +	ldr	x10, [x9, #MPIDR_HASH_MASK] +	/* +	 * Following code relies on the struct mpidr_hash +	 * members size. +	 */ +	ldp	w3, w4, [x9, #MPIDR_HASH_SHIFTS] +	ldp	w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)] +	compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10 +	add	x2, x2, x8, lsl #3 +#endif +	bl	__cpu_suspend_finisher +        /* +	 * Never gets here, unless suspend fails. 
+	 * Successful cpu_suspend should return from cpu_resume, returning +	 * through this code path is considered an error +	 * If the return value is set to 0 force x0 = -EOPNOTSUPP +	 * to make sure a proper error condition is propagated +	 */ +	cmp	x0, #0 +	mov	x3, #-EOPNOTSUPP +	csel	x0, x3, x0, eq +	add	sp, sp, #CPU_SUSPEND_SZ	// rewind stack pointer +	ldp	x19, x20, [sp, #16] +	ldp	x21, x22, [sp, #32] +	ldp	x23, x24, [sp, #48] +	ldp	x25, x26, [sp, #64] +	ldp	x27, x28, [sp, #80] +	ldp	x29, lr, [sp], #96 +	ret +ENDPROC(__cpu_suspend) +	.ltorg + +/* + * x0 must contain the sctlr value retrieved from restored context + */ +ENTRY(cpu_resume_mmu) +	ldr	x3, =cpu_resume_after_mmu +	msr	sctlr_el1, x0		// restore sctlr_el1 +	isb +	br	x3			// global jump to virtual address +ENDPROC(cpu_resume_mmu) +cpu_resume_after_mmu: +	mov	x0, #0			// return zero on success +	ldp	x19, x20, [sp, #16] +	ldp	x21, x22, [sp, #32] +	ldp	x23, x24, [sp, #48] +	ldp	x25, x26, [sp, #64] +	ldp	x27, x28, [sp, #80] +	ldp	x29, lr, [sp], #96 +	ret +ENDPROC(cpu_resume_after_mmu) + +	.data +ENTRY(cpu_resume) +	bl	el2_setup		// if in EL2 drop to EL1 cleanly +#ifdef CONFIG_SMP +	mrs	x1, mpidr_el1 +	adr	x4, mpidr_hash_ptr +	ldr	x5, [x4] +	add	x8, x4, x5		// x8 = struct mpidr_hash phys address +        /* retrieve mpidr_hash members to compute the hash */ +	ldr	x2, [x8, #MPIDR_HASH_MASK] +	ldp	w3, w4, [x8, #MPIDR_HASH_SHIFTS] +	ldp	w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)] +	compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2 +        /* x7 contains hash index, let's use it to grab context pointer */ +#else +	mov	x7, xzr +#endif +	adr	x0, sleep_save_sp +	ldr	x0, [x0, #SLEEP_SAVE_SP_PHYS] +	ldr	x0, [x0, x7, lsl #3] +	/* load sp from context */ +	ldr	x2, [x0, #CPU_CTX_SP] +	adr	x1, sleep_idmap_phys +	/* load physical address of identity map page table in x1 */ +	ldr	x1, [x1] +	mov	sp, x2 +	/* +	 * cpu_do_resume expects x0 to contain context physical address +	 * pointer and x1 to contain physical address of 1:1 page tables +	 */ +	bl	cpu_do_resume		// PC relative jump, MMU off +	b	cpu_resume_mmu		// Resume MMU, never returns +ENDPROC(cpu_resume) + +	.align 3 +mpidr_hash_ptr: +	/* +	 * offset of mpidr_hash symbol from current location +	 * used to obtain run-time mpidr_hash address with MMU off +         */ +	.quad	mpidr_hash - . 
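For readers following the hash logic: the index produced by compute_mpidr_hash above can be written in plain C roughly as below. This is an illustration only (mpidr_hash_index is not a symbol introduced by this patch), and the second assignment to aff2 in the pseudo-code comment is presumably intended to read "aff3 = mpidr_masked & 0xff00000000".

#include <linux/types.h>

/*
 * Illustrative C equivalent of compute_mpidr_hash: collapse the masked
 * MPIDR_EL1 affinity fields into a small, dense index using the
 * per-affinity-level shifts (rs0..rs3) computed at boot.
 */
static inline u32 mpidr_hash_index(u64 mpidr, u64 mask,
				   u32 rs0, u32 rs1, u32 rs2, u32 rs3)
{
	u64 m = mpidr & mask;

	return (u32)(((m & 0xff) >> rs0) |
		     ((m & 0xff00) >> rs1) |
		     ((m & 0xff0000) >> rs2) |
		     ((m & 0xff00000000UL) >> rs3));
}

This index is what __cpu_suspend and cpu_resume above use to select a per-CPU slot (hence the "lsl #3" scaling by sizeof(phys_addr_t)) in the sleep_save_sp area defined just below.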
+/* + * physical address of identity mapped page tables + */ +	.type	sleep_idmap_phys, #object +ENTRY(sleep_idmap_phys) +	.quad	0 +/* + * struct sleep_save_sp { + *	phys_addr_t *save_ptr_stash; + *	phys_addr_t save_ptr_stash_phys; + * }; + */ +	.type	sleep_save_sp, #object +ENTRY(sleep_save_sp) +	.space	SLEEP_SAVE_SP_SZ	// struct sleep_save_sp diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 78db90dcc91..40f38f46c8e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -35,10 +35,12 @@  #include <linux/clockchips.h>  #include <linux/completion.h>  #include <linux/of.h> +#include <linux/irq_work.h>  #include <asm/atomic.h>  #include <asm/cacheflush.h>  #include <asm/cputype.h> +#include <asm/cpu_ops.h>  #include <asm/mmu_context.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -54,70 +56,26 @@   * where to place its SVC stack   */  struct secondary_data secondary_data; -volatile unsigned long secondary_holding_pen_release = INVALID_HWID;  enum ipi_msg_type {  	IPI_RESCHEDULE,  	IPI_CALL_FUNC,  	IPI_CALL_FUNC_SINGLE,  	IPI_CPU_STOP, +	IPI_TIMER, +	IPI_IRQ_WORK,  }; -static DEFINE_RAW_SPINLOCK(boot_lock); - -/* - * Write secondary_holding_pen_release in a way that is guaranteed to be - * visible to all observers, irrespective of whether they're taking part - * in coherency or not.  This is necessary for the hotplug code to work - * reliably. - */ -static void write_pen_release(u64 val) -{ -	void *start = (void *)&secondary_holding_pen_release; -	unsigned long size = sizeof(secondary_holding_pen_release); - -	secondary_holding_pen_release = val; -	__flush_dcache_area(start, size); -} -  /*   * Boot a secondary CPU, and assign it the specified idle task.   * This also gives us the initial stack to use for this CPU.   */  static int boot_secondary(unsigned int cpu, struct task_struct *idle)  { -	unsigned long timeout; - -	/* -	 * Set synchronisation state between this boot processor -	 * and the secondary one -	 */ -	raw_spin_lock(&boot_lock); - -	/* -	 * Update the pen release flag. -	 */ -	write_pen_release(cpu_logical_map(cpu)); - -	/* -	 * Send an event, causing the secondaries to read pen_release. -	 */ -	sev(); - -	timeout = jiffies + (1 * HZ); -	while (time_before(jiffies, timeout)) { -		if (secondary_holding_pen_release == INVALID_HWID) -			break; -		udelay(10); -	} +	if (cpu_ops[cpu]->cpu_boot) +		return cpu_ops[cpu]->cpu_boot(cpu); -	/* -	 * Now the secondary core is starting up let it run its -	 * calibrations, then wait for it to finish -	 */ -	raw_spin_unlock(&boot_lock); - -	return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0; +	return -EOPNOTSUPP;  }  static DECLARE_COMPLETION(cpu_running); @@ -158,6 +116,11 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)  	return ret;  } +static void smp_store_cpu_info(unsigned int cpuid) +{ +	store_cpu_topology(cpuid); +} +  /*   * This is the secondary CPU boot entry.  We're using this CPUs   * idle thread stack, but a set of temporary page tables. @@ -167,8 +130,6 @@ asmlinkage void secondary_start_kernel(void)  	struct mm_struct *mm = &init_mm;  	unsigned int cpu = smp_processor_id(); -	printk("CPU%u: Booted secondary processor\n", cpu); -  	/*  	 * All kernel threads share the same mm context; grab a  	 * reference and switch to it. 
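The boot_secondary() rewrite above, and the hotplug and suspend paths later in this file, funnel everything through the per-CPU cpu_ops[] descriptors that cpu_read_ops() fills in during CPU enumeration. A rough sketch of the interface, inferred from the call sites visible in this diff (the authoritative definition lives in the new asm/cpu_ops.h, which is not among the hunks shown here):

#include <linux/of.h>
#include <linux/threads.h>

/* Sketch only: the methods as exercised by this diff, not a verbatim copy. */
struct cpu_operations {
	const char	*name;
	int		(*cpu_init)(struct device_node *, unsigned int);
	int		(*cpu_prepare)(unsigned int);
	int		(*cpu_boot)(unsigned int);
	void		(*cpu_postboot)(void);
#ifdef CONFIG_HOTPLUG_CPU
	int		(*cpu_disable)(unsigned int cpu);
	void		(*cpu_die)(unsigned int cpu);
	int		(*cpu_kill)(unsigned int cpu);
#endif
#ifdef CONFIG_ARM64_CPU_SUSPEND
	int		(*cpu_suspend)(unsigned long);
#endif
};

extern const struct cpu_operations *cpu_ops[NR_CPUS];

Backends such as the converted smp_spin_table.c (and the PSCI backend that supersedes the deleted smp_psci.c) only provide the hooks they support; the generic code treats a missing hook as unsupported, e.g. boot_secondary() above and op_cpu_disable() further down both fall back to -EOPNOTSUPP.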
@@ -177,6 +138,9 @@ asmlinkage void secondary_start_kernel(void)  	current->active_mm = mm;  	cpumask_set_cpu(cpu, mm_cpumask(mm)); +	set_my_cpu_offset(per_cpu_offset(smp_processor_id())); +	printk("CPU%u: Booted secondary processor\n", cpu); +  	/*  	 * TTBR0 is only used for the identity mapping at this stage. Make it  	 * point to zero page to avoid speculatively fetching new entries. @@ -187,17 +151,15 @@ asmlinkage void secondary_start_kernel(void)  	preempt_disable();  	trace_hardirqs_off(); -	/* -	 * Let the primary processor know we're out of the -	 * pen, then head off into the C entry point -	 */ -	write_pen_release(INVALID_HWID); +	if (cpu_ops[cpu]->cpu_postboot) +		cpu_ops[cpu]->cpu_postboot();  	/* -	 * Synchronise with the boot thread. +	 * Enable GIC and timers.  	 */ -	raw_spin_lock(&boot_lock); -	raw_spin_unlock(&boot_lock); +	notify_cpu_starting(cpu); + +	smp_store_cpu_info(cpu);  	/*  	 * OK, now it's safe to let the boot CPU continue.  Wait for @@ -207,13 +169,9 @@ asmlinkage void secondary_start_kernel(void)  	set_cpu_online(cpu, true);  	complete(&cpu_running); -	/* -	 * Enable GIC and timers. -	 */ -	notify_cpu_starting(cpu); - +	local_dbg_enable();  	local_irq_enable(); -	local_fiq_enable(); +	local_async_enable();  	/*  	 * OK, it's off to the idle thread for us @@ -221,39 +179,136 @@ asmlinkage void secondary_start_kernel(void)  	cpu_startup_entry(CPUHP_ONLINE);  } -void __init smp_cpus_done(unsigned int max_cpus) +#ifdef CONFIG_HOTPLUG_CPU +static int op_cpu_disable(unsigned int cpu)  { -	pr_info("SMP: Total of %d processors activated.\n", num_online_cpus()); +	/* +	 * If we don't have a cpu_die method, abort before we reach the point +	 * of no return. CPU0 may not have an cpu_ops, so test for it. +	 */ +	if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_die) +		return -EOPNOTSUPP; + +	/* +	 * We may need to abort a hot unplug for some other mechanism-specific +	 * reason. +	 */ +	if (cpu_ops[cpu]->cpu_disable) +		return cpu_ops[cpu]->cpu_disable(cpu); + +	return 0;  } -void __init smp_prepare_boot_cpu(void) +/* + * __cpu_disable runs on the processor to be shutdown. + */ +int __cpu_disable(void)  { -} +	unsigned int cpu = smp_processor_id(); +	int ret; -static void (*smp_cross_call)(const struct cpumask *, unsigned int); +	ret = op_cpu_disable(cpu); +	if (ret) +		return ret; -static const struct smp_enable_ops *enable_ops[] __initconst = { -	&smp_spin_table_ops, -	&smp_psci_ops, -	NULL, -}; +	/* +	 * Take this CPU offline.  Once we clear this, we can't return, +	 * and we must not schedule until we're ready to give up the cpu. +	 */ +	set_cpu_online(cpu, false); -static const struct smp_enable_ops *smp_enable_ops[NR_CPUS]; +	/* +	 * OK - migrate IRQs away from this CPU +	 */ +	migrate_irqs(); -static const struct smp_enable_ops * __init smp_get_enable_ops(const char *name) +	/* +	 * Remove this CPU from the vm mask set of all processes. +	 */ +	clear_tasks_mm_cpumask(cpu); + +	return 0; +} + +static int op_cpu_kill(unsigned int cpu)  { -	const struct smp_enable_ops **ops = enable_ops; +	/* +	 * If we have no means of synchronising with the dying CPU, then assume +	 * that it is really dead. We can only wait for an arbitrary length of +	 * time and hope that it's dead, so let's skip the wait and just hope. 
+	 */ +	if (!cpu_ops[cpu]->cpu_kill) +		return 1; + +	return cpu_ops[cpu]->cpu_kill(cpu); +} -	while (*ops) { -		if (!strcmp(name, (*ops)->name)) -			return *ops; +static DECLARE_COMPLETION(cpu_died); -		ops++; +/* + * called on the thread which is asking for a CPU to be shutdown - + * waits until shutdown has completed, or it is timed out. + */ +void __cpu_die(unsigned int cpu) +{ +	if (!wait_for_completion_timeout(&cpu_died, msecs_to_jiffies(5000))) { +		pr_crit("CPU%u: cpu didn't die\n", cpu); +		return;  	} +	pr_notice("CPU%u: shutdown\n", cpu); + +	/* +	 * Now that the dying CPU is beyond the point of no return w.r.t. +	 * in-kernel synchronisation, try to get the firwmare to help us to +	 * verify that it has really left the kernel before we consider +	 * clobbering anything it might still be using. +	 */ +	if (!op_cpu_kill(cpu)) +		pr_warn("CPU%d may not have shut down cleanly\n", cpu); +} + +/* + * Called from the idle thread for the CPU which has been shutdown. + * + * Note that we disable IRQs here, but do not re-enable them + * before returning to the caller. This is also the behaviour + * of the other hotplug-cpu capable cores, so presumably coming + * out of idle fixes this. + */ +void cpu_die(void) +{ +	unsigned int cpu = smp_processor_id(); + +	idle_task_exit(); + +	local_irq_disable(); + +	/* Tell __cpu_die() that this CPU is now safe to dispose of */ +	complete(&cpu_died); + +	/* +	 * Actually shutdown the CPU. This must never fail. The specific hotplug +	 * mechanism must perform all required cache maintenance to ensure that +	 * no dirty lines are lost in the process of shutting down the CPU. +	 */ +	cpu_ops[cpu]->cpu_die(cpu); + +	BUG(); +} +#endif + +void __init smp_cpus_done(unsigned int max_cpus) +{ +	pr_info("SMP: Total of %d processors activated.\n", num_online_cpus()); +} -	return NULL; +void __init smp_prepare_boot_cpu(void) +{ +	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));  } +static void (*smp_cross_call)(const struct cpumask *, unsigned int); +  /*   * Enumerate the possible CPU set from the device tree and build the   * cpu logical map array containing MPIDR values related to logical @@ -261,9 +316,8 @@ static const struct smp_enable_ops * __init smp_get_enable_ops(const char *name)   */  void __init smp_init_cpus(void)  { -	const char *enable_method;  	struct device_node *dn = NULL; -	int i, cpu = 1; +	unsigned int i, cpu = 1;  	bool bootcpu_valid = false;  	while ((dn = of_find_node_by_type(dn, "cpu"))) { @@ -332,25 +386,10 @@ void __init smp_init_cpus(void)  		if (cpu >= NR_CPUS)  			goto next; -		/* -		 * We currently support only the "spin-table" enable-method. 
-		 */ -		enable_method = of_get_property(dn, "enable-method", NULL); -		if (!enable_method) { -			pr_err("%s: missing enable-method property\n", -				dn->full_name); -			goto next; -		} - -		smp_enable_ops[cpu] = smp_get_enable_ops(enable_method); - -		if (!smp_enable_ops[cpu]) { -			pr_err("%s: invalid enable-method property: %s\n", -			       dn->full_name, enable_method); +		if (cpu_read_ops(dn, cpu) != 0)  			goto next; -		} -		if (smp_enable_ops[cpu]->init_cpu(dn, cpu)) +		if (cpu_ops[cpu]->cpu_init(dn, cpu))  			goto next;  		pr_debug("cpu logical map 0x%llx\n", hwid); @@ -380,8 +419,12 @@ next:  void __init smp_prepare_cpus(unsigned int max_cpus)  { -	int cpu, err; -	unsigned int ncores = num_possible_cpus(); +	int err; +	unsigned int cpu, ncores = num_possible_cpus(); + +	init_cpu_topology(); + +	smp_store_cpu_info(smp_processor_id());  	/*  	 * are we trying to boot more cores than exist? @@ -408,10 +451,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)  		if (cpu == smp_processor_id())  			continue; -		if (!smp_enable_ops[cpu]) +		if (!cpu_ops[cpu])  			continue; -		err = smp_enable_ops[cpu]->prepare_cpu(cpu); +		err = cpu_ops[cpu]->cpu_prepare(cpu);  		if (err)  			continue; @@ -436,12 +479,22 @@ void arch_send_call_function_single_ipi(int cpu)  	smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);  } +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) +{ +	if (smp_cross_call) +		smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); +} +#endif +  static const char *ipi_types[NR_IPI] = {  #define S(x,s)	[x - IPI_RESCHEDULE] = s  	S(IPI_RESCHEDULE, "Rescheduling interrupts"),  	S(IPI_CALL_FUNC, "Function call interrupts"),  	S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),  	S(IPI_CPU_STOP, "CPU stop interrupts"), +	S(IPI_TIMER, "Timer broadcast interrupts"), +	S(IPI_IRQ_WORK, "IRQ work interrupts"),  };  void show_ipi_list(struct seq_file *p, int prec) @@ -451,7 +504,7 @@ void show_ipi_list(struct seq_file *p, int prec)  	for (i = 0; i < NR_IPI; i++) {  		seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i + IPI_RESCHEDULE,  			   prec >= 4 ? " " : ""); -		for_each_present_cpu(cpu) +		for_each_online_cpu(cpu)  			seq_printf(p, "%10u ",  				   __get_irq_stat(cpu, ipi_irqs[i]));  		seq_printf(p, "      %s\n", ipi_types[i]); @@ -486,7 +539,6 @@ static void ipi_cpu_stop(unsigned int cpu)  	set_cpu_online(cpu, false); -	local_fiq_disable();  	local_irq_disable();  	while (1) @@ -527,6 +579,22 @@ void handle_IPI(int ipinr, struct pt_regs *regs)  		irq_exit();  		break; +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +	case IPI_TIMER: +		irq_enter(); +		tick_receive_broadcast(); +		irq_exit(); +		break; +#endif + +#ifdef CONFIG_IRQ_WORK +	case IPI_IRQ_WORK: +		irq_enter(); +		irq_work_run(); +		irq_exit(); +		break; +#endif +  	default:  		pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);  		break; @@ -539,6 +607,13 @@ void smp_send_reschedule(int cpu)  	smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);  } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +void tick_broadcast(const struct cpumask *mask) +{ +	smp_cross_call(mask, IPI_TIMER); +} +#endif +  void smp_send_stop(void)  {  	unsigned long timeout; diff --git a/arch/arm64/kernel/smp_psci.c b/arch/arm64/kernel/smp_psci.c deleted file mode 100644 index 0c533301be7..00000000000 --- a/arch/arm64/kernel/smp_psci.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * PSCI SMP initialisation - * - * Copyright (C) 2013 ARM Ltd. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/init.h> -#include <linux/of.h> -#include <linux/smp.h> - -#include <asm/psci.h> -#include <asm/smp_plat.h> - -static int __init smp_psci_init_cpu(struct device_node *dn, int cpu) -{ -	return 0; -} - -static int __init smp_psci_prepare_cpu(int cpu) -{ -	int err; - -	if (!psci_ops.cpu_on) { -		pr_err("psci: no cpu_on method, not booting CPU%d\n", cpu); -		return -ENODEV; -	} - -	err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa(secondary_holding_pen)); -	if (err) { -		pr_err("psci: failed to boot CPU%d (%d)\n", cpu, err); -		return err; -	} - -	return 0; -} - -const struct smp_enable_ops smp_psci_ops __initconst = { -	.name		= "psci", -	.init_cpu	= smp_psci_init_cpu, -	.prepare_cpu	= smp_psci_prepare_cpu, -}; diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 7c35fa682f7..0347d38eea2 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -16,15 +16,38 @@   * along with this program.  If not, see <http://www.gnu.org/licenses/>.   */ +#include <linux/delay.h>  #include <linux/init.h>  #include <linux/of.h>  #include <linux/smp.h>  #include <asm/cacheflush.h> +#include <asm/cpu_ops.h> +#include <asm/cputype.h> +#include <asm/smp_plat.h> + +extern void secondary_holding_pen(void); +volatile unsigned long secondary_holding_pen_release = INVALID_HWID;  static phys_addr_t cpu_release_addr[NR_CPUS]; -static int __init smp_spin_table_init_cpu(struct device_node *dn, int cpu) +/* + * Write secondary_holding_pen_release in a way that is guaranteed to be + * visible to all observers, irrespective of whether they're taking part + * in coherency or not.  This is necessary for the hotplug code to work + * reliably. + */ +static void write_pen_release(u64 val) +{ +	void *start = (void *)&secondary_holding_pen_release; +	unsigned long size = sizeof(secondary_holding_pen_release); + +	secondary_holding_pen_release = val; +	__flush_dcache_area(start, size); +} + + +static int smp_spin_table_cpu_init(struct device_node *dn, unsigned int cpu)  {  	/*  	 * Determine the address from which the CPU is polling. @@ -40,7 +63,7 @@ static int __init smp_spin_table_init_cpu(struct device_node *dn, int cpu)  	return 0;  } -static int __init smp_spin_table_prepare_cpu(int cpu) +static int smp_spin_table_cpu_prepare(unsigned int cpu)  {  	void **release_addr; @@ -48,7 +71,16 @@ static int __init smp_spin_table_prepare_cpu(int cpu)  		return -ENODEV;  	release_addr = __va(cpu_release_addr[cpu]); -	release_addr[0] = (void *)__pa(secondary_holding_pen); + +	/* +	 * We write the release address as LE regardless of the native +	 * endianess of the kernel. Therefore, any boot-loaders that +	 * read this address need to convert this address to the +	 * boot-loader's endianess before jumping. This is mandated by +	 * the boot protocol. 
+	 */ +	release_addr[0] = (void *) cpu_to_le64(__pa(secondary_holding_pen)); +  	__flush_dcache_area(release_addr, sizeof(release_addr[0]));  	/* @@ -59,8 +91,24 @@ static int __init smp_spin_table_prepare_cpu(int cpu)  	return 0;  } -const struct smp_enable_ops smp_spin_table_ops __initconst = { +static int smp_spin_table_cpu_boot(unsigned int cpu) +{ +	/* +	 * Update the pen release flag. +	 */ +	write_pen_release(cpu_logical_map(cpu)); + +	/* +	 * Send an event, causing the secondaries to read pen_release. +	 */ +	sev(); + +	return 0; +} + +const struct cpu_operations smp_spin_table_ops = {  	.name		= "spin-table", -	.init_cpu 	= smp_spin_table_init_cpu, -	.prepare_cpu	= smp_spin_table_prepare_cpu, +	.cpu_init	= smp_spin_table_cpu_init, +	.cpu_prepare	= smp_spin_table_cpu_prepare, +	.cpu_boot	= smp_spin_table_cpu_boot,  }; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d25459ff57f..55437ba1f5a 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -35,7 +35,7 @@   *	ldp	x29, x30, [sp]   *	add	sp, sp, #0x10   */ -int unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct stackframe *frame)  {  	unsigned long high, low;  	unsigned long fp = frame->fp; @@ -43,12 +43,16 @@ int unwind_frame(struct stackframe *frame)  	low  = frame->sp;  	high = ALIGN(low, THREAD_SIZE); -	if (fp < low || fp > high || fp & 0xf) +	if (fp < low || fp > high - 0x18 || fp & 0xf)  		return -EINVAL;  	frame->sp = fp + 0x10;  	frame->fp = *(unsigned long *)(fp); -	frame->pc = *(unsigned long *)(fp + 8); +	/* +	 * -4 here because we care about the PC at time of bl, +	 * not where the return will go. +	 */ +	frame->pc = *(unsigned long *)(fp + 8) - 4;  	return 0;  } diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c new file mode 100644 index 00000000000..1fa9ce4afd8 --- /dev/null +++ b/arch/arm64/kernel/suspend.c @@ -0,0 +1,140 @@ +#include <linux/percpu.h> +#include <linux/slab.h> +#include <asm/cacheflush.h> +#include <asm/cpu_ops.h> +#include <asm/debug-monitors.h> +#include <asm/pgtable.h> +#include <asm/memory.h> +#include <asm/smp_plat.h> +#include <asm/suspend.h> +#include <asm/tlbflush.h> + +extern int __cpu_suspend(unsigned long); +/* + * This is called by __cpu_suspend() to save the state, and do whatever + * flushing is required to ensure that when the CPU goes to sleep we have + * the necessary data available when the caches are not searched. + * + * @arg: Argument to pass to suspend operations + * @ptr: CPU context virtual address + * @save_ptr: address of the location where the context physical address + *            must be saved + */ +int __cpu_suspend_finisher(unsigned long arg, struct cpu_suspend_ctx *ptr, +			   phys_addr_t *save_ptr) +{ +	int cpu = smp_processor_id(); + +	*save_ptr = virt_to_phys(ptr); + +	cpu_do_suspend(ptr); +	/* +	 * Only flush the context that must be retrieved with the MMU +	 * off. VA primitives ensure the flush is applied to all +	 * cache levels so context is pushed to DRAM. +	 */ +	__flush_dcache_area(ptr, sizeof(*ptr)); +	__flush_dcache_area(save_ptr, sizeof(*save_ptr)); + +	return cpu_ops[cpu]->cpu_suspend(arg); +} + +/* + * This hook is provided so that cpu_suspend code can restore HW + * breakpoints as early as possible in the resume path, before reenabling + * debug exceptions. 
Code cannot be run from a CPU PM notifier since by the + * time the notifier runs debug exceptions might have been enabled already, + * with HW breakpoints registers content still in an unknown state. + */ +void (*hw_breakpoint_restore)(void *); +void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) +{ +	/* Prevent multiple restore hook initializations */ +	if (WARN_ON(hw_breakpoint_restore)) +		return; +	hw_breakpoint_restore = hw_bp_restore; +} + +/** + * cpu_suspend + * + * @arg: argument to pass to the finisher function + */ +int cpu_suspend(unsigned long arg) +{ +	struct mm_struct *mm = current->active_mm; +	int ret, cpu = smp_processor_id(); +	unsigned long flags; + +	/* +	 * If cpu_ops have not been registered or suspend +	 * has not been initialized, cpu_suspend call fails early. +	 */ +	if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_suspend) +		return -EOPNOTSUPP; + +	/* +	 * From this point debug exceptions are disabled to prevent +	 * updates to mdscr register (saved and restored along with +	 * general purpose registers) from kernel debuggers. +	 */ +	local_dbg_save(flags); + +	/* +	 * mm context saved on the stack, it will be restored when +	 * the cpu comes out of reset through the identity mapped +	 * page tables, so that the thread address space is properly +	 * set-up on function return. +	 */ +	ret = __cpu_suspend(arg); +	if (ret == 0) { +		cpu_switch_mm(mm->pgd, mm); +		flush_tlb_all(); + +		/* +		 * Restore per-cpu offset before any kernel +		 * subsystem relying on it has a chance to run. +		 */ +		set_my_cpu_offset(per_cpu_offset(cpu)); + +		/* +		 * Restore HW breakpoint registers to sane values +		 * before debug exceptions are possibly reenabled +		 * through local_dbg_restore. +		 */ +		if (hw_breakpoint_restore) +			hw_breakpoint_restore(NULL); +	} + +	/* +	 * Restore pstate flags. OS lock and mdscr have been already +	 * restored, so from this point onwards, debugging is fully +	 * renabled if it was enabled when core started shutdown. +	 */ +	local_dbg_restore(flags); + +	return ret; +} + +extern struct sleep_save_sp sleep_save_sp; +extern phys_addr_t sleep_idmap_phys; + +static int cpu_suspend_init(void) +{ +	void *ctx_ptr; + +	/* ctx_ptr is an array of physical addresses */ +	ctx_ptr = kcalloc(mpidr_hash_size(), sizeof(phys_addr_t), GFP_KERNEL); + +	if (WARN_ON(!ctx_ptr)) +		return -ENOMEM; + +	sleep_save_sp.save_ptr_stash = ctx_ptr; +	sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr); +	sleep_idmap_phys = virt_to_phys(idmap_pg_dir); +	__flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp)); +	__flush_dcache_area(&sleep_idmap_phys, sizeof(sleep_idmap_phys)); + +	return 0; +} +early_initcall(cpu_suspend_init); diff --git a/arch/arm64/kernel/sys32.S b/arch/arm64/kernel/sys32.S index a1b19ed7467..423a5b3fc2b 100644 --- a/arch/arm64/kernel/sys32.S +++ b/arch/arm64/kernel/sys32.S @@ -59,48 +59,48 @@ ENDPROC(compat_sys_fstatfs64_wrapper)   * extension.   
*/  compat_sys_pread64_wrapper: -	orr	x3, x4, x5, lsl #32 +	regs_to_64	x3, x4, x5  	b	sys_pread64  ENDPROC(compat_sys_pread64_wrapper)  compat_sys_pwrite64_wrapper: -	orr	x3, x4, x5, lsl #32 +	regs_to_64	x3, x4, x5  	b	sys_pwrite64  ENDPROC(compat_sys_pwrite64_wrapper)  compat_sys_truncate64_wrapper: -	orr	x1, x2, x3, lsl #32 +	regs_to_64	x1, x2, x3  	b	sys_truncate  ENDPROC(compat_sys_truncate64_wrapper)  compat_sys_ftruncate64_wrapper: -	orr	x1, x2, x3, lsl #32 +	regs_to_64	x1, x2, x3  	b	sys_ftruncate  ENDPROC(compat_sys_ftruncate64_wrapper)  compat_sys_readahead_wrapper: -	orr	x1, x2, x3, lsl #32 +	regs_to_64	x1, x2, x3  	mov	w2, w4  	b	sys_readahead  ENDPROC(compat_sys_readahead_wrapper)  compat_sys_fadvise64_64_wrapper:  	mov	w6, w1 -	orr	x1, x2, x3, lsl #32 -	orr	x2, x4, x5, lsl #32 +	regs_to_64	x1, x2, x3 +	regs_to_64	x2, x4, x5  	mov	w3, w6  	b	sys_fadvise64_64  ENDPROC(compat_sys_fadvise64_64_wrapper)  compat_sys_sync_file_range2_wrapper: -	orr	x2, x2, x3, lsl #32 -	orr	x3, x4, x5, lsl #32 +	regs_to_64	x2, x2, x3 +	regs_to_64	x3, x4, x5  	b	sys_sync_file_range2  ENDPROC(compat_sys_sync_file_range2_wrapper)  compat_sys_fallocate_wrapper: -	orr	x2, x2, x3, lsl #32 -	orr	x3, x4, x5, lsl #32 +	regs_to_64	x2, x2, x3 +	regs_to_64	x3, x4, x5  	b	sys_fallocate  ENDPROC(compat_sys_fallocate_wrapper) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 03dc3718eb1..1a7125c3099 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -18,6 +18,7 @@   * along with this program.  If not, see <http://www.gnu.org/licenses/>.   */ +#include <linux/clockchips.h>  #include <linux/export.h>  #include <linux/kernel.h>  #include <linux/interrupt.h> @@ -33,6 +34,7 @@  #include <linux/irq.h>  #include <linux/delay.h>  #include <linux/clocksource.h> +#include <linux/clk-provider.h>  #include <clocksource/arm_arch_timer.h> @@ -61,26 +63,19 @@ unsigned long profile_pc(struct pt_regs *regs)  EXPORT_SYMBOL(profile_pc);  #endif -static u64 sched_clock_mult __read_mostly; - -unsigned long long notrace sched_clock(void) -{ -	return arch_timer_read_counter() * sched_clock_mult; -} -  void __init time_init(void)  {  	u32 arch_timer_rate; +	of_clk_init(NULL);  	clocksource_of_init(); +	tick_setup_hrtimer_broadcast(); +  	arch_timer_rate = arch_timer_get_rate();  	if (!arch_timer_rate)  		panic("Unable to initialise architected timer.\n"); -	/* Cache the sched_clock multiplier to save a divide in the hot path. */ -	sched_clock_mult = NSEC_PER_SEC / arch_timer_rate; -  	/* Calibrate the delay loop directly */  	lpj_fine = arch_timer_rate / HZ;  } diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c new file mode 100644 index 00000000000..43514f90591 --- /dev/null +++ b/arch/arm64/kernel/topology.c @@ -0,0 +1,283 @@ +/* + * arch/arm64/kernel/topology.c + * + * Copyright (C) 2011,2013,2014 Linaro Limited. + * + * Based on the arm32 version written by Vincent Guittot in turn based on + * arch/sh/kernel/topology.c + * + * This file is subject to the terms and conditions of the GNU General Public + * License.  See the file "COPYING" in the main directory of this archive + * for more details. 
+ */ + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/percpu.h> +#include <linux/node.h> +#include <linux/nodemask.h> +#include <linux/of.h> +#include <linux/sched.h> + +#include <asm/topology.h> + +static int __init get_cpu_for_node(struct device_node *node) +{ +	struct device_node *cpu_node; +	int cpu; + +	cpu_node = of_parse_phandle(node, "cpu", 0); +	if (!cpu_node) +		return -1; + +	for_each_possible_cpu(cpu) { +		if (of_get_cpu_node(cpu, NULL) == cpu_node) { +			of_node_put(cpu_node); +			return cpu; +		} +	} + +	pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); + +	of_node_put(cpu_node); +	return -1; +} + +static int __init parse_core(struct device_node *core, int cluster_id, +			     int core_id) +{ +	char name[10]; +	bool leaf = true; +	int i = 0; +	int cpu; +	struct device_node *t; + +	do { +		snprintf(name, sizeof(name), "thread%d", i); +		t = of_get_child_by_name(core, name); +		if (t) { +			leaf = false; +			cpu = get_cpu_for_node(t); +			if (cpu >= 0) { +				cpu_topology[cpu].cluster_id = cluster_id; +				cpu_topology[cpu].core_id = core_id; +				cpu_topology[cpu].thread_id = i; +			} else { +				pr_err("%s: Can't get CPU for thread\n", +				       t->full_name); +				of_node_put(t); +				return -EINVAL; +			} +			of_node_put(t); +		} +		i++; +	} while (t); + +	cpu = get_cpu_for_node(core); +	if (cpu >= 0) { +		if (!leaf) { +			pr_err("%s: Core has both threads and CPU\n", +			       core->full_name); +			return -EINVAL; +		} + +		cpu_topology[cpu].cluster_id = cluster_id; +		cpu_topology[cpu].core_id = core_id; +	} else if (leaf) { +		pr_err("%s: Can't get CPU for leaf core\n", core->full_name); +		return -EINVAL; +	} + +	return 0; +} + +static int __init parse_cluster(struct device_node *cluster, int depth) +{ +	char name[10]; +	bool leaf = true; +	bool has_cores = false; +	struct device_node *c; +	static int cluster_id __initdata; +	int core_id = 0; +	int i, ret; + +	/* +	 * First check for child clusters; we currently ignore any +	 * information about the nesting of clusters and present the +	 * scheduler with a flat list of them. +	 */ +	i = 0; +	do { +		snprintf(name, sizeof(name), "cluster%d", i); +		c = of_get_child_by_name(cluster, name); +		if (c) { +			leaf = false; +			ret = parse_cluster(c, depth + 1); +			of_node_put(c); +			if (ret != 0) +				return ret; +		} +		i++; +	} while (c); + +	/* Now check for cores */ +	i = 0; +	do { +		snprintf(name, sizeof(name), "core%d", i); +		c = of_get_child_by_name(cluster, name); +		if (c) { +			has_cores = true; + +			if (depth == 0) { +				pr_err("%s: cpu-map children should be clusters\n", +				       c->full_name); +				of_node_put(c); +				return -EINVAL; +			} + +			if (leaf) { +				ret = parse_core(c, cluster_id, core_id++); +			} else { +				pr_err("%s: Non-leaf cluster with core %s\n", +				       cluster->full_name, name); +				ret = -EINVAL; +			} + +			of_node_put(c); +			if (ret != 0) +				return ret; +		} +		i++; +	} while (c); + +	if (leaf && !has_cores) +		pr_warn("%s: empty cluster\n", cluster->full_name); + +	if (leaf) +		cluster_id++; + +	return 0; +} + +static int __init parse_dt_topology(void) +{ +	struct device_node *cn, *map; +	int ret = 0; +	int cpu; + +	cn = of_find_node_by_path("/cpus"); +	if (!cn) { +		pr_err("No CPU information found in DT\n"); +		return 0; +	} + +	/* +	 * When topology is provided cpu-map is essentially a root +	 * cluster with restricted subnodes. 
+	 */ +	map = of_get_child_by_name(cn, "cpu-map"); +	if (!map) +		goto out; + +	ret = parse_cluster(map, 0); +	if (ret != 0) +		goto out_map; + +	/* +	 * Check that all cores are in the topology; the SMP code will +	 * only mark cores described in the DT as possible. +	 */ +	for_each_possible_cpu(cpu) { +		if (cpu_topology[cpu].cluster_id == -1) { +			pr_err("CPU%d: No topology information specified\n", +			       cpu); +			ret = -EINVAL; +		} +	} + +out_map: +	of_node_put(map); +out: +	of_node_put(cn); +	return ret; +} + +/* + * cpu topology table + */ +struct cpu_topology cpu_topology[NR_CPUS]; +EXPORT_SYMBOL_GPL(cpu_topology); + +const struct cpumask *cpu_coregroup_mask(int cpu) +{ +	return &cpu_topology[cpu].core_sibling; +} + +static void update_siblings_masks(unsigned int cpuid) +{ +	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; +	int cpu; + +	if (cpuid_topo->cluster_id == -1) { +		/* +		 * DT does not contain topology information for this cpu. +		 */ +		pr_debug("CPU%u: No topology information configured\n", cpuid); +		return; +	} + +	/* update core and thread sibling masks */ +	for_each_possible_cpu(cpu) { +		cpu_topo = &cpu_topology[cpu]; + +		if (cpuid_topo->cluster_id != cpu_topo->cluster_id) +			continue; + +		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); +		if (cpu != cpuid) +			cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + +		if (cpuid_topo->core_id != cpu_topo->core_id) +			continue; + +		cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling); +		if (cpu != cpuid) +			cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling); +	} +} + +void store_cpu_topology(unsigned int cpuid) +{ +	update_siblings_masks(cpuid); +} + +static void __init reset_cpu_topology(void) +{ +	unsigned int cpu; + +	for_each_possible_cpu(cpu) { +		struct cpu_topology *cpu_topo = &cpu_topology[cpu]; + +		cpu_topo->thread_id = -1; +		cpu_topo->core_id = 0; +		cpu_topo->cluster_id = -1; + +		cpumask_clear(&cpu_topo->core_sibling); +		cpumask_set_cpu(cpu, &cpu_topo->core_sibling); +		cpumask_clear(&cpu_topo->thread_sibling); +		cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); +	} +} + +void __init init_cpu_topology(void) +{ +	reset_cpu_topology(); + +	/* +	 * Discard anything that was parsed if we hit an error so we +	 * don't use partial information. 
+	 */ +	if (parse_dt_topology()) +		reset_cpu_topology(); +} diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7ffadddb645..c43cfa9b830 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -251,10 +251,13 @@ void die(const char *str, struct pt_regs *regs, int err)  void arm64_notify_die(const char *str, struct pt_regs *regs,  		      struct siginfo *info, int err)  { -	if (user_mode(regs)) +	if (user_mode(regs)) { +		current->thread.fault_address = 0; +		current->thread.fault_code = err;  		force_sig_info(info->si_signo, info, current); -	else +	} else {  		die(str, regs, err); +	}  }  asmlinkage void __exception do_undefinstr(struct pt_regs *regs) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 6a389dc1bd4..50384fec56c 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -58,7 +58,10 @@ static struct page *vectors_page[1];  static int alloc_vectors_page(void)  {  	extern char __kuser_helper_start[], __kuser_helper_end[]; +	extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; +  	int kuser_sz = __kuser_helper_end - __kuser_helper_start; +	int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start;  	unsigned long vpage;  	vpage = get_zeroed_page(GFP_ATOMIC); @@ -72,7 +75,7 @@ static int alloc_vectors_page(void)  	/* sigreturn code */  	memcpy((void *)vpage + AARCH32_KERN_SIGRET_CODE_OFFSET, -		aarch32_sigret_code, sizeof(aarch32_sigret_code)); +               __aarch32_sigret_code_start, sigret_sz);  	flush_icache_range(vpage, vpage + PAGE_SIZE);  	vectors_page[0] = virt_to_page(vpage); @@ -103,49 +106,31 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp)  static int __init vdso_init(void)  { -	struct page *pg; -	char *vbase; -	int i, ret = 0; +	int i; + +	if (memcmp(&vdso_start, "\177ELF", 4)) { +		pr_err("vDSO is not a valid ELF object!\n"); +		return -EINVAL; +	}  	vdso_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT;  	pr_info("vdso: %ld pages (%ld code, %ld data) at base %p\n",  		vdso_pages + 1, vdso_pages, 1L, &vdso_start);  	/* Allocate the vDSO pagelist, plus a page for the data. */ -	vdso_pagelist = kzalloc(sizeof(struct page *) * (vdso_pages + 1), +	vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *),  				GFP_KERNEL); -	if (vdso_pagelist == NULL) { -		pr_err("Failed to allocate vDSO pagelist!\n"); +	if (vdso_pagelist == NULL)  		return -ENOMEM; -	}  	/* Grab the vDSO code pages. */ -	for (i = 0; i < vdso_pages; i++) { -		pg = virt_to_page(&vdso_start + i*PAGE_SIZE); -		ClearPageReserved(pg); -		get_page(pg); -		vdso_pagelist[i] = pg; -	} - -	/* Sanity check the shared object header. */ -	vbase = vmap(vdso_pagelist, 1, 0, PAGE_KERNEL); -	if (vbase == NULL) { -		pr_err("Failed to map vDSO pagelist!\n"); -		return -ENOMEM; -	} else if (memcmp(vbase, "\177ELF", 4)) { -		pr_err("vDSO is not a valid ELF object!\n"); -		ret = -EINVAL; -		goto unmap; -	} +	for (i = 0; i < vdso_pages; i++) +		vdso_pagelist[i] = virt_to_page(&vdso_start + i * PAGE_SIZE);  	/* Grab the vDSO data page. 
*/ -	pg = virt_to_page(vdso_data); -	get_page(pg); -	vdso_pagelist[i] = pg; +	vdso_pagelist[i] = virt_to_page(vdso_data); -unmap: -	vunmap(vbase); -	return ret; +	return 0;  }  arch_initcall(vdso_init); @@ -235,6 +220,8 @@ void update_vsyscall(struct timekeeper *tk)  	vdso_data->use_syscall			= use_syscall;  	vdso_data->xtime_coarse_sec		= xtime_coarse.tv_sec;  	vdso_data->xtime_coarse_nsec		= xtime_coarse.tv_nsec; +	vdso_data->wtm_clock_sec		= tk->wall_to_monotonic.tv_sec; +	vdso_data->wtm_clock_nsec		= tk->wall_to_monotonic.tv_nsec;  	if (!use_syscall) {  		vdso_data->cs_cycle_last	= tk->clock->cycle_last; @@ -242,8 +229,6 @@ void update_vsyscall(struct timekeeper *tk)  		vdso_data->xtime_clock_nsec	= tk->xtime_nsec;  		vdso_data->cs_mult		= tk->mult;  		vdso_data->cs_shift		= tk->shift; -		vdso_data->wtm_clock_sec	= tk->wall_to_monotonic.tv_sec; -		vdso_data->wtm_clock_nsec	= tk->wall_to_monotonic.tv_nsec;  	}  	smp_wmb(); diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index d8064af42e6..6d20b7d162d 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -48,7 +48,7 @@ $(obj-vdso): %.o: %.S  # Actual build commands  quiet_cmd_vdsold = VDSOL $@ -      cmd_vdsold = $(CC) $(c_flags) -Wl,-T $^ -o $@ +      cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $^ -o $@  quiet_cmd_vdsoas = VDSOA $@        cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index f0a6d10b521..fe652ffd34c 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -103,6 +103,8 @@ ENTRY(__kernel_clock_gettime)  	bl	__do_get_tspec  	seqcnt_check w9, 1b +	mov	x30, x2 +  	cmp	w0, #CLOCK_MONOTONIC  	b.ne	6f @@ -118,6 +120,9 @@ ENTRY(__kernel_clock_gettime)  	ccmp	w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne  	b.ne	8f +	/* xtime_coarse_nsec is already right-shifted */ +	mov	x12, #0 +  	/* Get coarse timespec. */  	adr	vdso_data, _vdso_data  3:	seqcnt_acquire @@ -156,7 +161,7 @@ ENTRY(__kernel_clock_gettime)  	lsr	x11, x11, x12  	stp	x10, x11, [x1, #TSPEC_TV_SEC]  	mov	x0, xzr -	ret	x2 +	ret  7:  	mov	x30, x2  8:	/* Syscall fallback. */ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f8ab9d8e2ea..f1e6d5c032e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@  #define ARM_EXIT_DISCARD(x)	x  OUTPUT_ARCH(aarch64) -ENTRY(stext) +ENTRY(_text)  jiffies = jiffies_64; @@ -54,7 +54,6 @@ SECTIONS  	}  	.text : {			/* Real text segment		*/  		_stext = .;		/* Text and read-only data	*/ -			*(.smp.pen.text)  			__exception_text_start = .;  			*(.exception.text)  			__exception_text_end = .; @@ -97,37 +96,17 @@ SECTIONS  	PERCPU_SECTION(64)  	__init_end = .; -	. = ALIGN(THREAD_SIZE); -	__data_loc = .; - -	.data : AT(__data_loc) { -		_data = .;		/* address in memory */ -		_sdata = .; - -		/* -		 * first, the init task union, aligned -		 * to an 8192 byte boundary. -		 */ -		INIT_TASK_DATA(THREAD_SIZE) -		NOSAVE_DATA -		CACHELINE_ALIGNED_DATA(64) -		READ_MOSTLY_DATA(64) - -		/* -		 * and the usual data section -		 */ -		DATA_DATA -		CONSTRUCTORS - -		_edata = .; -	} -	_edata_loc = __data_loc + SIZEOF(.data); + +	. = ALIGN(PAGE_SIZE); +	_data = .; +	_sdata = .; +	RW_DATA_SECTION(64, PAGE_SIZE, THREAD_SIZE) +	_edata = .;  	BSS_SECTION(0, 0, 0)  	_end = .;  	STABS_DEBUG -	.comment 0 : { *(.comment) }  }  /*  | 
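A data-flow note on the gettimeofday.S hunk above: the added "mov x12, #0" is needed because xtime_coarse_nsec is published by update_vsyscall() as plain nanoseconds, whereas the fine-grained path carries nanoseconds scaled by cs_shift and shifts them right ("lsr x11, x11, x12") in the shared tail. As an illustration only (the helper below is not part of the patch; field names follow the vdso_data updates in vdso.c above), the CLOCK_MONOTONIC_COARSE result amounts to:

#include <linux/time.h>
#include <asm/vdso_datapage.h>

/* Illustrative C model of the coarse monotonic path, not patch code. */
static void coarse_monotonic(const struct vdso_data *vd, struct timespec *ts)
{
	u64 sec  = vd->xtime_coarse_sec + vd->wtm_clock_sec;
	u64 nsec = vd->xtime_coarse_nsec + vd->wtm_clock_nsec;

	/* xtime_coarse_nsec is already in ns, so no cs_shift is applied */
	while (nsec >= NSEC_PER_SEC) {
		nsec -= NSEC_PER_SEC;
		sec++;
	}

	ts->tv_sec  = sec;
	ts->tv_nsec = nsec;
}

This is also why update_vsyscall() now writes wtm_clock_sec/nsec outside the !use_syscall branch: the coarse clocks are serviced in the vDSO even when the cycle-counter path has to fall back to the syscall, so they need a valid wall-to-monotonic offset in the data page.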