21 files changed, 514 insertions, 377 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 0ade0acc1ed..0573faab96a 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -6,15 +6,14 @@
 
 lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
-		   delay.o findbit.o memchr.o memcpy.o		      \
+		   delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
 		   memmove.o memset.o memzero.o setbit.o              \
-		   strncpy_from_user.o strnlen_user.o                 \
 		   strchr.o strrchr.o                                 \
 		   testchangebit.o testclearbit.o testsetbit.o        \
 		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
 		   ucmpdi2.o lib1funcs.o div64.o                      \
 		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
-		   call_with_stack.o
+		   call_with_stack.o bswapsdi2.o
 
 mmu-y	:= clear_user.o copy_page.o getuser.o putuser.o
 
@@ -42,7 +41,12 @@ else
 endif
 
 lib-$(CONFIG_ARCH_RPC)		+= ecard.o io-acorn.o floppydma.o
-lib-$(CONFIG_ARCH_SHARK)	+= io-shark.o
 
 $(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
+
+ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
+  NEON_FLAGS			:= -mfloat-abi=softfp -mfpu=neon
+  CFLAGS_xor-neon.o		+= $(NEON_FLAGS)
+  obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
+endif
diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S
index cd07b5814c2..4102be617fc 100644
--- a/arch/arm/lib/backtrace.S
+++ b/arch/arm/lib/backtrace.S
@@ -80,14 +80,14 @@ for_each_frame:	tst	frame, mask		@ Check for address exceptions
 
 		ldr	r1, [sv_pc, #-4]	@ if stmfd sp!, {args} exists,
 		ldr	r3, .Ldsi+4
-		teq	r3, r1, lsr #10
+		teq	r3, r1, lsr #11
 		ldreq	r0, [frame, #-8]	@ get sp
 		subeq	r0, r0, #4		@ point at the last arg
 		bleq	.Ldumpstm		@ dump saved registers
 
 1004:		ldr	r1, [sv_pc, #0]		@ if stmfd sp!, {..., fp, ip, lr, pc}
 		ldr	r3, .Ldsi		@ instruction exists,
-		teq	r3, r1, lsr #10
+		teq	r3, r1, lsr #11
 		subeq	r0, frame, #16
 		bleq	.Ldumpstm		@ dump saved registers
 
@@ -128,11 +128,11 @@ ENDPROC(c_backtrace)
 		beq	2f
 		add	r7, r7, #1
 		teq	r7, #6
-		moveq	r7, #1
-		moveq	r1, #'\n'
-		movne	r1, #' '
-		ldr	r3, [stack], #-4
-		mov	r2, reg
+		moveq	r7, #0
+		adr	r3, .Lcr
+		addne	r3, r3, #1		@ skip newline
+		ldr	r2, [stack], #-4
+		mov	r1, reg
 		adr	r0, .Lfp
 		bl	printk
 2:		subs	reg, reg, #1
@@ -142,11 +142,11 @@ ENDPROC(c_backtrace)
 		blne	printk
 		ldmfd	sp!, {instr, reg, stack, r7, pc}
 
-.Lfp:		.asciz	"%cr%d:%08x"
+.Lfp:		.asciz	" r%d:%08x%s"
 .Lcr:		.asciz	"\n"
 .Lbad:		.asciz	"Backtrace aborted due to bad frame pointer <%p>\n"
 		.align
-.Ldsi:		.word	0xe92dd800 >> 10	@ stmfd sp!, {... fp, ip, lr, pc}
-		.word	0xe92d0000 >> 10	@ stmfd sp!, {}
+.Ldsi:		.word	0xe92dd800 >> 11	@ stmfd sp!, {... fp, ip, lr, pc}
+		.word	0xe92d0000 >> 11	@ stmfd sp!, {}
 
 #endif
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index d6408d1ee54..9f12ed1eea8 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -10,6 +10,11 @@ UNWIND(	.fnstart	)
 	and	r3, r0, #31		@ Get bit offset
 	mov	r0, r0, lsr #5
 	add	r1, r1, r0, lsl #2	@ Get word offset
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+	.arch_extension	mp
+	ALT_SMP(W(pldw)	[r1])
+	ALT_UP(W(nop))
+#endif
 	mov	r3, r2, lsl r3
 1:	ldrex	r2, [r1]
 	\instr	r2, r2, r3
@@ -32,6 +37,11 @@ UNWIND(	.fnstart	)
 	add	r1, r1, r0, lsl #2	@ Get word offset
 	mov	r3, r2, lsl r3		@ create mask
 	smp_dmb
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+	.arch_extension	mp
+	ALT_SMP(W(pldw)	[r1])
+	ALT_UP(W(nop))
+#endif
 1:	ldrex	r2, [r1]
 	ands	r0, r2, r3		@ save old value of bit
 	\instr	r2, r2, r3		@ toggle bit
diff --git a/arch/arm/lib/bswapsdi2.S b/arch/arm/lib/bswapsdi2.S
new file mode 100644
index 00000000000..9fcdd154eff
--- /dev/null
+++ b/arch/arm/lib/bswapsdi2.S
@@ -0,0 +1,36 @@
+#include <linux/linkage.h>
+
+#if __LINUX_ARM_ARCH__ >= 6
+ENTRY(__bswapsi2)
+	rev r0, r0
+	bx lr
+ENDPROC(__bswapsi2)
+
+ENTRY(__bswapdi2)
+	rev r3, r0
+	rev r0, r1
+	mov r1, r3
+	bx lr
+ENDPROC(__bswapdi2)
+#else
+ENTRY(__bswapsi2)
+	eor r3, r0, r0, ror #16
+	mov r3, r3, lsr #8
+	bic r3, r3, #0xff00
+	eor r0, r3, r0, ror #8
+	mov pc, lr
+ENDPROC(__bswapsi2)
+
+ENTRY(__bswapdi2)
+	mov ip, r1
+	eor r3, ip, ip, ror #16
+	eor r1, r0, r0, ror #16
+	mov r1, r1, lsr #8
+	mov r3, r3, lsr #8
+	bic r3, r3, #0xff00
+	bic r1, r1, #0xff00
+	eor r1, r1, r0, ror #8
+	eor r0, r3, ip, ror #8
+	mov pc, lr
+ENDPROC(__bswapdi2)
+#endif
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 805e3f8fb00..3bc8eb811a7 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -197,24 +197,24 @@
 
 12:	PLD(	pld	[r1, #124]		)
 13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, lr, pull #\pull
+		mov	r3, lr, lspull #\pull
 		subs	r2, r2, #32
 		ldr4w	r1, r8, r9, ip, lr, abort=19f
-		orr	r3, r3, r4, push #\push
-		mov	r4, r4, pull #\pull
-		orr	r4, r4, r5, push #\push
-		mov	r5, r5, pull #\pull
-		orr	r5, r5, r6, push #\push
-		mov	r6, r6, pull #\pull
-		orr	r6, r6, r7, push #\push
-		mov	r7, r7, pull #\pull
-		orr	r7, r7, r8, push #\push
-		mov	r8, r8, pull #\pull
-		orr	r8, r8, r9, push #\push
-		mov	r9, r9, pull #\pull
-		orr	r9, r9, ip, push #\push
-		mov	ip, ip, pull #\pull
-		orr	ip, ip, lr, push #\push
+		orr	r3, r3, r4, lspush #\push
+		mov	r4, r4, lspull #\pull
+		orr	r4, r4, r5, lspush #\push
+		mov	r5, r5, lspull #\pull
+		orr	r5, r5, r6, lspush #\push
+		mov	r6, r6, lspull #\pull
+		orr	r6, r6, r7, lspush #\push
+		mov	r7, r7, lspull #\pull
+		orr	r7, r7, r8, lspush #\push
+		mov	r8, r8, lspull #\pull
+		orr	r8, r8, r9, lspush #\push
+		mov	r9, r9, lspull #\pull
+		orr	r9, r9, ip, lspush #\push
+		mov	ip, ip, lspull #\pull
+		orr	ip, ip, lr, lspush #\push
 		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
 		bge	12b
 	PLD(	cmn	r2, #96			)
@@ -225,10 +225,10 @@
 14:		ands	ip, r2, #28
 		beq	16f
 
-15:		mov	r3, lr, pull #\pull
+15:		mov	r3, lr, lspull #\pull
 		ldr1w	r1, lr, abort=21f
 		subs	ip, ip, #4
-		orr	r3, r3, lr, push #\push
+		orr	r3, r3, lr, lspush #\push
 		str1w	r0, r3, abort=21f
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S
index d620a5f22a0..d6e742d2400 100644
--- a/arch/arm/lib/csumpartialcopygeneric.S
+++ b/arch/arm/lib/csumpartialcopygeneric.S
@@ -141,7 +141,7 @@ FN_ENTRY
 		tst	len, #2
 		mov	r5, r4, get_byte_0
 		beq	.Lexit
-		adcs	sum, sum, r4, push #16
+		adcs	sum, sum, r4, lspush #16
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_1
 		strb	r5, [dst], #1
@@ -171,23 +171,23 @@ FN_ENTRY
 		cmp	ip, #2
 		beq	.Lsrc2_aligned
 		bhi	.Lsrc3_aligned
-		mov	r4, r5, pull #8		@ C = 0
+		mov	r4, r5, lspull #8		@ C = 0
 		bics	ip, len, #15
 		beq	2f
 1:		load4l	r5, r6, r7, r8
-		orr	r4, r4, r5, push #24
-		mov	r5, r5, pull #8
-		orr	r5, r5, r6, push #24
-		mov	r6, r6, pull #8
-		orr	r6, r6, r7, push #24
-		mov	r7, r7, pull #8
-		orr	r7, r7, r8, push #24
+		orr	r4, r4, r5, lspush #24
+		mov	r5, r5, lspull #8
+		orr	r5, r5, r6, lspush #24
+		mov	r6, r6, lspull #8
+		orr	r6, r6, r7, lspush #24
+		mov	r7, r7, lspull #8
+		orr	r7, r7, r8, lspush #24
 		stmia	dst!, {r4, r5, r6, r7}
 		adcs	sum, sum, r4
 		adcs	sum, sum, r5
 		adcs	sum, sum, r6
 		adcs	sum, sum, r7
-		mov	r4, r8, pull #8
+		mov	r4, r8, lspull #8
 		sub	ip, ip, #16
 		teq	ip, #0
 		bne	1b
@@ -196,50 +196,50 @@ FN_ENTRY
 		tst	ip, #8
 		beq	3f
 		load2l	r5, r6
-		orr	r4, r4, r5, push #24
-		mov	r5, r5, pull #8
-		orr	r5, r5, r6, push #24
+		orr	r4, r4, r5, lspush #24
+		mov	r5, r5, lspull #8
+		orr	r5, r5, r6, lspush #24
 		stmia	dst!, {r4, r5}
 		adcs	sum, sum, r4
 		adcs	sum, sum, r5
-		mov	r4, r6, pull #8
+		mov	r4, r6, lspull #8
 		tst	ip, #4
 		beq	4f
 3:		load1l	r5
-		orr	r4, r4, r5, push #24
+		orr	r4, r4, r5, lspush #24
 		str	r4, [dst], #4
 		adcs	sum, sum, r4
-		mov	r4, r5, pull #8
+		mov	r4, r5, lspull #8
 4:		ands	len, len, #3
 		beq	.Ldone
 		mov	r5, r4, get_byte_0
 		tst	len, #2
 		beq	.Lexit
-		adcs	sum, sum, r4, push #16
+		adcs	sum, sum, r4, lspush #16
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_1
 		strb	r5, [dst], #1
 		mov	r5, r4, get_byte_2
 		b	.Lexit
 
-.Lsrc2_aligned:	mov	r4, r5, pull #16
+.Lsrc2_aligned:	mov	r4, r5, lspull #16
 		adds	sum, sum, #0
 		bics	ip, len, #15
 		beq	2f
 1:		load4l	r5, r6, r7, r8
-		orr	r4, r4, r5, push #16
-		mov	r5, r5, pull #16
-		orr	r5, r5, r6, push #16
-		mov	r6, r6, pull #16
-		orr	r6, r6, r7, push #16
-		mov	r7, r7, pull #16
-		orr	r7, r7, r8, push #16
+		orr	r4, r4, r5, lspush #16
+		mov	r5, r5, lspull #16
+		orr	r5, r5, r6, lspush #16
+		mov	r6, r6, lspull #16
+		orr	r6, r6, r7, lspush #16
+		mov	r7, r7, lspull #16
+		orr	r7, r7, r8, lspush #16
 		stmia	dst!, {r4, r5, r6, r7}
 		adcs	sum, sum, r4
 		adcs	sum, sum, r5
 		adcs	sum, sum, r6
 		adcs	sum, sum, r7
-		mov	r4, r8, pull #16
+		mov	r4, r8, lspull #16
 		sub	ip, ip, #16
 		teq	ip, #0
 		bne	1b
@@ -248,20 +248,20 @@ FN_ENTRY
 		tst	ip, #8
 		beq	3f
 		load2l	r5, r6
-		orr	r4, r4, r5, push #16
-		mov	r5, r5, pull #16
-		orr	r5, r5, r6, push #16
+		orr	r4, r4, r5, lspush #16
+		mov	r5, r5, lspull #16
+		orr	r5, r5, r6, lspush #16
 		stmia	dst!, {r4, r5}
 		adcs	sum, sum, r4
 		adcs	sum, sum, r5
-		mov	r4, r6, pull #16
+		mov	r4, r6, lspull #16
 		tst	ip, #4
 		beq	4f
 3:		load1l	r5
-		orr	r4, r4, r5, push #16
+		orr	r4, r4, r5, lspush #16
 		str	r4, [dst], #4
 		adcs	sum, sum, r4
-		mov	r4, r5, pull #16
+		mov	r4, r5, lspull #16
 4:		ands	len, len, #3
 		beq	.Ldone
 		mov	r5, r4, get_byte_0
@@ -276,24 +276,24 @@ FN_ENTRY
 		load1b	r5
 		b	.Lexit
 
-.Lsrc3_aligned:	mov	r4, r5, pull #24
+.Lsrc3_aligned:	mov	r4, r5, lspull #24
 		adds	sum, sum, #0
 		bics	ip, len, #15
 		beq	2f
 1:		load4l	r5, r6, r7, r8
-		orr	r4, r4, r5, push #8
-		mov	r5, r5, pull #24
-		orr	r5, r5, r6, push #8
-		mov	r6, r6, pull #24
-		orr	r6, r6, r7, push #8
-		mov	r7, r7, pull #24
-		orr	r7, r7, r8, push #8
+		orr	r4, r4, r5, lspush #8
+		mov	r5, r5, lspull #24
+		orr	r5, r5, r6, lspush #8
+		mov	r6, r6, lspull #24
+		orr	r6, r6, r7, lspush #8
+		mov	r7, r7, lspull #24
+		orr	r7, r7, r8, lspush #8
 		stmia	dst!, {r4, r5, r6, r7}
 		adcs	sum, sum, r4
 		adcs	sum, sum, r5
 		adcs	sum, sum, r6
 		adcs	sum, sum, r7
-		mov	r4, r8, pull #24
+		mov	r4, r8, lspull #24
 		sub	ip, ip, #16
 		teq	ip, #0
 		bne	1b
@@ -302,20 +302,20 @@ FN_ENTRY
 		tst	ip, #8
 		beq	3f
 		load2l	r5, r6
-		orr	r4, r4, r5, push #8
-		mov	r5, r5, pull #24
-		orr	r5, r5, r6, push #8
+		orr	r4, r4, r5, lspush #8
+		mov	r5, r5, lspull #24
+		orr	r5, r5, r6, lspush #8
 		stmia	dst!, {r4, r5}
 		adcs	sum, sum, r4
 		adcs	sum, sum, r5
-		mov	r4, r6, pull #24
+		mov	r4, r6, lspull #24
 		tst	ip, #4
 		beq	4f
 3:		load1l	r5
-		orr	r4, r4, r5, push #8
+		orr	r4, r4, r5, lspush #8
 		str	r4, [dst], #4
 		adcs	sum, sum, r4
-		mov	r4, r5, pull #24
+		mov	r4, r5, lspull #24
 4:		ands	len, len, #3
 		beq	.Ldone
 		mov	r5, r4, get_byte_0
@@ -326,7 +326,7 @@ FN_ENTRY
 		load1l	r4
 		mov	r5, r4, get_byte_0
 		strb	r5, [dst], #1
-		adcs	sum, sum, r4, push #24
+		adcs	sum, sum, r4, lspush #24
 		mov	r5, r4, get_byte_1
 		b	.Lexit
 FN_EXIT
diff --git a/arch/arm/lib/delay.S b/arch/arm/lib/delay-loop.S
index 3c9a05c8d20..bc1033b897b 100644
--- a/arch/arm/lib/delay.S
+++ b/arch/arm/lib/delay-loop.S
@@ -9,11 +9,11 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/param.h>
+#include <asm/delay.h>
 		.text
 
 .LC0:		.word	loops_per_jiffy
-.LC1:		.word	(2199023*HZ)>>11
+.LC1:		.word	UDELAY_MULT
 
 /*
  * r0  <= 2000
@@ -21,10 +21,10 @@
  * HZ  <= 1000
  */
 
-ENTRY(__udelay)
+ENTRY(__loop_udelay)
 		ldr	r2, .LC1
 		mul	r0, r2, r0
-ENTRY(__const_udelay)				@ 0 <= r0 <= 0x7fffff06
+ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0x7fffff06
 		mov	r1, #-1
 		ldr	r2, .LC0
 		ldr	r2, [r2]		@ max = 0x01ffffff
@@ -39,12 +39,11 @@ ENTRY(__const_udelay)				@ 0 <= r0 <= 0x7fffff06
 
 /*
  * loops = r0 * HZ * loops_per_jiffy / 1000000
- *
- * Oh, if only we had a cycle counter...
  */
+		.align 3
 
 @ Delay routine
-ENTRY(__delay)
+ENTRY(__loop_delay)
 		subs	r0, r0, #1
 #if 0
 		movls	pc, lr
@@ -62,8 +61,8 @@ ENTRY(__delay)
 		movls	pc, lr
 		subs	r0, r0, #1
 #endif
-		bhi	__delay
+		bhi	__loop_delay
 		mov	pc, lr
-ENDPROC(__udelay)
-ENDPROC(__const_udelay)
-ENDPROC(__delay)
+ENDPROC(__loop_udelay)
+ENDPROC(__loop_const_udelay)
+ENDPROC(__loop_delay)
diff --git a/arch/arm/lib/delay.c b/arch/arm/lib/delay.c
new file mode 100644
index 00000000000..5306de35013
--- /dev/null
+++ b/arch/arm/lib/delay.c
@@ -0,0 +1,93 @@
+/*
+ * Delay loops based on the OpenRISC implementation.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timex.h>
+
+/*
+ * Default to the loop-based delay implementation.
+ */
+struct arm_delay_ops arm_delay_ops = {
+	.delay		= __loop_delay,
+	.const_udelay	= __loop_const_udelay,
+	.udelay		= __loop_udelay,
+};
+
+static const struct delay_timer *delay_timer;
+static bool delay_calibrated;
+
+int read_current_timer(unsigned long *timer_val)
+{
+	if (!delay_timer)
+		return -ENXIO;
+
+	*timer_val = delay_timer->read_current_timer();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(read_current_timer);
+
+static void __timer_delay(unsigned long cycles)
+{
+	cycles_t start = get_cycles();
+
+	while ((get_cycles() - start) < cycles)
+		cpu_relax();
+}
+
+static void __timer_const_udelay(unsigned long xloops)
+{
+	unsigned long long loops = xloops;
+	loops *= arm_delay_ops.ticks_per_jiffy;
+	__timer_delay(loops >> UDELAY_SHIFT);
+}
+
+static void __timer_udelay(unsigned long usecs)
+{
+	__timer_const_udelay(usecs * UDELAY_MULT);
+}
+
+void __init register_current_timer_delay(const struct delay_timer *timer)
+{
+	if (!delay_calibrated) {
+		pr_info("Switching to timer-based delay loop\n");
+		delay_timer			= timer;
+		lpj_fine			= timer->freq / HZ;
+
+		/* cpufreq may scale loops_per_jiffy, so keep a private copy */
+		arm_delay_ops.ticks_per_jiffy	= lpj_fine;
+		arm_delay_ops.delay		= __timer_delay;
+		arm_delay_ops.const_udelay	= __timer_const_udelay;
+		arm_delay_ops.udelay		= __timer_udelay;
+
+		delay_calibrated		= true;
+	} else {
+		pr_info("Ignoring duplicate/late registration of read_current_timer delay\n");
+	}
+}
+
+unsigned long calibrate_delay_is_known(void)
+{
+	delay_calibrated = true;
+	return lpj_fine;
+}
diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S
index 11093a7c3e3..9b06bb41fca 100644
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -16,8 +16,9 @@
  * __get_user_X
  *
  * Inputs:	r0 contains the address
+ *		r1 contains the address limit, which must be preserved
  * Outputs:	r0 is the error code
- *		r2, r3 contains the zero-extended value
+ *		r2 contains the zero-extended value
  *		lr corrupted
  *
  * No other registers must be altered.  (see <asm/uaccess.h>
@@ -27,33 +28,39 @@
  * Note also that it is intended that __get_user_bad is not global.
  */
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/errno.h>
 #include <asm/domain.h>
 
 ENTRY(__get_user_1)
+	check_uaccess r0, 1, r1, r2, __get_user_bad
 1: TUSER(ldrb)	r2, [r0]
 	mov	r0, #0
 	mov	pc, lr
 ENDPROC(__get_user_1)
 
 ENTRY(__get_user_2)
-#ifdef CONFIG_THUMB2_KERNEL
-2: TUSER(ldrb)	r2, [r0]
-3: TUSER(ldrb)	r3, [r0, #1]
+	check_uaccess r0, 2, r1, r2, __get_user_bad
+#ifdef CONFIG_CPU_USE_DOMAINS
+rb	.req	ip
+2:	ldrbt	r2, [r0], #1
+3:	ldrbt	rb, [r0], #0
 #else
-2: TUSER(ldrb)	r2, [r0], #1
-3: TUSER(ldrb)	r3, [r0]
+rb	.req	r0
+2:	ldrb	r2, [r0]
+3:	ldrb	rb, [r0, #1]
 #endif
 #ifndef __ARMEB__
-	orr	r2, r2, r3, lsl #8
+	orr	r2, r2, rb, lsl #8
 #else
-	orr	r2, r3, r2, lsl #8
+	orr	r2, rb, r2, lsl #8
 #endif
 	mov	r0, #0
 	mov	pc, lr
 ENDPROC(__get_user_2)
 
 ENTRY(__get_user_4)
+	check_uaccess r0, 4, r1, r2, __get_user_bad
 4: TUSER(ldr)	r2, [r0]
 	mov	r0, #0
 	mov	pc, lr
diff --git a/arch/arm/lib/io-acorn.S b/arch/arm/lib/io-acorn.S
index 1b197ea7aab..69719bad674 100644
--- a/arch/arm/lib/io-acorn.S
+++ b/arch/arm/lib/io-acorn.S
@@ -11,13 +11,14 @@
  *
  */
 #include <linux/linkage.h>
+#include <linux/kern_levels.h>
 #include <asm/assembler.h>
 
 		.text
 		.align
 
 .Liosl_warning:
-		.ascii	"<4>insl/outsl not implemented, called from %08lX\0"
+		.ascii	KERN_WARNING "insl/outsl not implemented, called from %08lX\0"
 		.align
 
 /*
diff --git a/arch/arm/lib/io-readsl.S b/arch/arm/lib/io-readsl.S
index 5fb97e7f9f4..7a7430950c7 100644
--- a/arch/arm/lib/io-readsl.S
+++ b/arch/arm/lib/io-readsl.S
@@ -47,25 +47,25 @@ ENTRY(__raw_readsl)
 		strb	ip, [r1], #1
 
 4:		subs	r2, r2, #1
-		mov	ip, r3, pull #24
+		mov	ip, r3, lspull #24
 		ldrne	r3, [r0]
-		orrne	ip, ip, r3, push #8
+		orrne	ip, ip, r3, lspush #8
 		strne	ip, [r1], #4
 		bne	4b
 		b	8f
 
 5:		subs	r2, r2, #1
-		mov	ip, r3, pull #16
+		mov	ip, r3, lspull #16
 		ldrne	r3, [r0]
-		orrne	ip, ip, r3, push #16
+		orrne	ip, ip, r3, lspush #16
 		strne	ip, [r1], #4
 		bne	5b
 		b	7f
 
 6:		subs	r2, r2, #1
-		mov	ip, r3, pull #8
+		mov	ip, r3, lspull #8
 		ldrne	r3, [r0]
-		orrne	ip, ip, r3, push #24
+		orrne	ip, ip, r3, lspush #24
 		strne	ip, [r1], #4
 		bne	6b
 
diff --git a/arch/arm/lib/io-shark.c b/arch/arm/lib/io-shark.c
deleted file mode 100644
index 824253948f5..00000000000
--- a/arch/arm/lib/io-shark.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- *  linux/arch/arm/lib/io-shark.c
- *
- *  by Alexander Schulz
- *
- * derived from:
- * linux/arch/arm/lib/io-ebsa.S
- * Copyright (C) 1995, 1996 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
diff --git a/arch/arm/lib/io-writesl.S b/arch/arm/lib/io-writesl.S
index 8d3b7813725..d0d104a0dd1 100644
--- a/arch/arm/lib/io-writesl.S
+++ b/arch/arm/lib/io-writesl.S
@@ -41,26 +41,26 @@ ENTRY(__raw_writesl)
 		blt	5f
 		bgt	6f
 
-4:		mov	ip, r3, pull #16
+4:		mov	ip, r3, lspull #16
 		ldr	r3, [r1], #4
 		subs	r2, r2, #1
-		orr	ip, ip, r3, push #16
+		orr	ip, ip, r3, lspush #16
 		str	ip, [r0]
 		bne	4b
 		mov	pc, lr
 
-5:		mov	ip, r3, pull #8
+5:		mov	ip, r3, lspull #8
 		ldr	r3, [r1], #4
 		subs	r2, r2, #1
-		orr	ip, ip, r3, push #24
+		orr	ip, ip, r3, lspush #24
 		str	ip, [r0]
 		bne	5b
 		mov	pc, lr
 
-6:		mov	ip, r3, pull #24
+6:		mov	ip, r3, lspull #24
 		ldr	r3, [r1], #4
 		subs	r2, r2, #1
-		orr	ip, ip, r3, push #8
+		orr	ip, ip, r3, lspush #8
 		str	ip, [r0]
 		bne	6b
 		mov	pc, lr
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
index 938fc14f962..d1fc0c0c342 100644
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@@ -147,24 +147,24 @@ ENTRY(memmove)
 
 12:	PLD(	pld	[r1, #-128]		)
 13:		ldmdb   r1!, {r7, r8, r9, ip}
-		mov     lr, r3, push #\push
+		mov     lr, r3, lspush #\push
 		subs    r2, r2, #32
 		ldmdb   r1!, {r3, r4, r5, r6}
-		orr     lr, lr, ip, pull #\pull
-		mov     ip, ip, push #\push
-		orr     ip, ip, r9, pull #\pull
-		mov     r9, r9, push #\push
-		orr     r9, r9, r8, pull #\pull
-		mov     r8, r8, push #\push
-		orr     r8, r8, r7, pull #\pull
-		mov     r7, r7, push #\push
-		orr     r7, r7, r6, pull #\pull
-		mov     r6, r6, push #\push
-		orr     r6, r6, r5, pull #\pull
-		mov     r5, r5, push #\push
-		orr     r5, r5, r4, pull #\pull
-		mov     r4, r4, push #\push
-		orr     r4, r4, r3, pull #\pull
+		orr     lr, lr, ip, lspull #\pull
+		mov     ip, ip, lspush #\push
+		orr     ip, ip, r9, lspull #\pull
+		mov     r9, r9, lspush #\push
+		orr     r9, r9, r8, lspull #\pull
+		mov     r8, r8, lspush #\push
+		orr     r8, r8, r7, lspull #\pull
+		mov     r7, r7, lspush #\push
+		orr     r7, r7, r6, lspull #\pull
+		mov     r6, r6, lspush #\push
+		orr     r6, r6, r5, lspull #\pull
+		mov     r5, r5, lspush #\push
+		orr     r5, r5, r4, lspull #\pull
+		mov     r4, r4, lspush #\push
+		orr     r4, r4, r3, lspull #\pull
 		stmdb   r0!, {r4 - r9, ip, lr}
 		bge	12b
 	PLD(	cmn	r2, #96			)
@@ -175,10 +175,10 @@ ENTRY(memmove)
 14:		ands	ip, r2, #28
 		beq	16f
 
-15:		mov     lr, r3, push #\push
+15:		mov     lr, r3, lspush #\push
 		ldr	r3, [r1, #-4]!
 		subs	ip, ip, #4
-		orr	lr, lr, r3, pull #\pull
+		orr	lr, lr, r3, lspull #\pull
 		str	lr, [r0, #-4]!
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 650d5923ab8..94b0650ea98 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -14,27 +14,15 @@
 
 	.text
 	.align	5
-	.word	0
-
-1:	subs	r2, r2, #4		@ 1 do we have enough
-	blt	5f			@ 1 bytes to align with?
-	cmp	r3, #2			@ 1
-	strltb	r1, [r0], #1		@ 1
-	strleb	r1, [r0], #1		@ 1
-	strb	r1, [r0], #1		@ 1
-	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
-/*
- * The pointer is now aligned and the length is adjusted.  Try doing the
- * memset again.
- */
 
 ENTRY(memset)
 	ands	r3, r0, #3		@ 1 unaligned?
-	bne	1b			@ 1
+	mov	ip, r0			@ preserve r0 as return value
+	bne	6f			@ 1
 /*
- * we know that the pointer in r0 is aligned to a word boundary.
+ * we know that the pointer in ip is aligned to a word boundary.
  */
-	orr	r1, r1, r1, lsl #8
+1:	orr	r1, r1, r1, lsl #8
 	orr	r1, r1, r1, lsl #16
 	mov	r3, r1
 	cmp	r2, #16
@@ -43,29 +31,28 @@ ENTRY(memset)
 #if ! CALGN(1)+0
 
 /*
- * We need an extra register for this loop - save the return address and
- * use the LR
+ * We need 2 extra registers for this loop - use r8 and the LR
  */
-	str	lr, [sp, #-4]!
-	mov	ip, r1
+	stmfd	sp!, {r8, lr}
+	mov	r8, r1
 	mov	lr, r1
 
 2:	subs	r2, r2, #64
-	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
-	stmgeia	r0!, {r1, r3, ip, lr}
-	stmgeia	r0!, {r1, r3, ip, lr}
-	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
 	bgt	2b
-	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
+	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
 /*
  * No need to correct the count; we're only testing bits from now on
  */
 	tst	r2, #32
-	stmneia	r0!, {r1, r3, ip, lr}
-	stmneia	r0!, {r1, r3, ip, lr}
+	stmneia	ip!, {r1, r3, r8, lr}
+	stmneia	ip!, {r1, r3, r8, lr}
 	tst	r2, #16
-	stmneia	r0!, {r1, r3, ip, lr}
-	ldr	lr, [sp], #4
+	stmneia	ip!, {r1, r3, r8, lr}
+	ldmfd	sp!, {r8, lr}
 
 #else
 
@@ -74,54 +61,63 @@ ENTRY(memset)
  * whole cache lines at once.
  */
 
-	stmfd	sp!, {r4-r7, lr}
+	stmfd	sp!, {r4-r8, lr}
 	mov	r4, r1
 	mov	r5, r1
 	mov	r6, r1
 	mov	r7, r1
-	mov	ip, r1
+	mov	r8, r1
 	mov	lr, r1
 
 	cmp	r2, #96
-	tstgt	r0, #31
+	tstgt	ip, #31
 	ble	3f
 
-	and	ip, r0, #31
-	rsb	ip, ip, #32
-	sub	r2, r2, ip
-	movs	ip, ip, lsl #(32 - 4)
-	stmcsia	r0!, {r4, r5, r6, r7}
-	stmmiia	r0!, {r4, r5}
-	tst	ip, #(1 << 30)
-	mov	ip, r1
-	strne	r1, [r0], #4
+	and	r8, ip, #31
+	rsb	r8, r8, #32
+	sub	r2, r2, r8
+	movs	r8, r8, lsl #(32 - 4)
+	stmcsia	ip!, {r4, r5, r6, r7}
+	stmmiia	ip!, {r4, r5}
+	tst	r8, #(1 << 30)
+	mov	r8, r1
+	strne	r1, [ip], #4
 
 3:	subs	r2, r2, #64
-	stmgeia	r0!, {r1, r3-r7, ip, lr}
-	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	ip!, {r1, r3-r8, lr}
+	stmgeia	ip!, {r1, r3-r8, lr}
 	bgt	3b
-	ldmeqfd	sp!, {r4-r7, pc}
+	ldmeqfd	sp!, {r4-r8, pc}
 
 	tst	r2, #32
-	stmneia	r0!, {r1, r3-r7, ip, lr}
+	stmneia	ip!, {r1, r3-r8, lr}
 	tst	r2, #16
-	stmneia	r0!, {r4-r7}
-	ldmfd	sp!, {r4-r7, lr}
+	stmneia	ip!, {r4-r7}
+	ldmfd	sp!, {r4-r8, lr}
 
 #endif
 
 4:	tst	r2, #8
-	stmneia	r0!, {r1, r3}
+	stmneia	ip!, {r1, r3}
 	tst	r2, #4
-	strne	r1, [r0], #4
+	strne	r1, [ip], #4
 /*
  * When we get here, we've got less than 4 bytes to zero.  We
  * may have an unaligned pointer as well.
  */
 5:	tst	r2, #2
-	strneb	r1, [r0], #1
-	strneb	r1, [r0], #1
+	strneb	r1, [ip], #1
+	strneb	r1, [ip], #1
 	tst	r2, #1
-	strneb	r1, [r0], #1
+	strneb	r1, [ip], #1
 	mov	pc, lr
+
+6:	subs	r2, r2, #4		@ 1 do we have enough
+	blt	5b			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r1, [ip], #1		@ 1
+	strleb	r1, [ip], #1		@ 1
+	strb	r1, [ip], #1		@ 1
+	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
+	b	1b
 ENDPROC(memset)
diff --git a/arch/arm/lib/putuser.S b/arch/arm/lib/putuser.S
index 7db25990c58..3d73dcb959b 100644
--- a/arch/arm/lib/putuser.S
+++ b/arch/arm/lib/putuser.S
@@ -16,6 +16,7 @@
  * __put_user_X
  *
  * Inputs:	r0 contains the address
+ *		r1 contains the address limit, which must be preserved
  *		r2, r3 contains the value
  * Outputs:	r0 is the error code
  *		lr corrupted
@@ -27,16 +28,19 @@
  * Note also that it is intended that __put_user_bad is not global.
  */
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/errno.h>
 #include <asm/domain.h>
 
 ENTRY(__put_user_1)
+	check_uaccess r0, 1, r1, ip, __put_user_bad
 1: TUSER(strb)	r2, [r0]
 	mov	r0, #0
 	mov	pc, lr
 ENDPROC(__put_user_1)
 
 ENTRY(__put_user_2)
+	check_uaccess r0, 2, r1, ip, __put_user_bad
 	mov	ip, r2, lsr #8
 #ifdef CONFIG_THUMB2_KERNEL
 #ifndef __ARMEB__
@@ -60,12 +64,14 @@ ENTRY(__put_user_2)
 ENDPROC(__put_user_2)
 
 ENTRY(__put_user_4)
+	check_uaccess r0, 4, r1, ip, __put_user_bad
 4: TUSER(str)	r2, [r0]
 	mov	r0, #0
 	mov	pc, lr
 ENDPROC(__put_user_4)
 
 ENTRY(__put_user_8)
+	check_uaccess r0, 8, r1, ip, __put_user_bad
 #ifdef CONFIG_THUMB2_KERNEL
 5: TUSER(str)	r2, [r0]
 6: TUSER(str)	r3, [r0, #4]
diff --git a/arch/arm/lib/strncpy_from_user.S b/arch/arm/lib/strncpy_from_user.S
deleted file mode 100644
index f202d7bd164..00000000000
--- a/arch/arm/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *  linux/arch/arm/lib/strncpy_from_user.S
- *
- *  Copyright (C) 1995-2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-
-	.text
-	.align	5
-
-/*
- * Copy a string from user space to kernel space.
- *  r0 = dst, r1 = src, r2 = byte length
- * returns the number of characters copied (strlen of copied string),
- *  -EFAULT on exception, or "len" if we fill the whole buffer
- */
-ENTRY(__strncpy_from_user)
-	mov	ip, r1
-1:	subs	r2, r2, #1
-	ldrusr	r3, r1, 1, pl
-	bmi	2f
-	strb	r3, [r0], #1
-	teq	r3, #0
-	bne	1b
-	sub	r1, r1, #1	@ take NUL character out of count
-2:	sub	r0, r1, ip
-	mov	pc, lr
-ENDPROC(__strncpy_from_user)
-
-	.pushsection .fixup,"ax"
-	.align	0
-9001:	mov	r3, #0
-	strb	r3, [r0, #0]	@ null terminate
-	mov	r0, #-EFAULT
-	mov	pc, lr
-	.popsection
-
diff --git a/arch/arm/lib/strnlen_user.S b/arch/arm/lib/strnlen_user.S
deleted file mode 100644
index 0ecbb459c4f..00000000000
--- a/arch/arm/lib/strnlen_user.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- *  linux/arch/arm/lib/strnlen_user.S
- *
- *  Copyright (C) 1995-2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-
-	.text
-	.align	5
-
-/* Prototype: unsigned long __strnlen_user(const char *str, long n)
- * Purpose  : get length of a string in user memory
- * Params   : str - address of string in user memory
- * Returns  : length of string *including terminator*
- *	      or zero on exception, or n + 1 if too long
- */
-ENTRY(__strnlen_user)
-	mov	r2, r0
-1:
-	ldrusr	r3, r0, 1
-	teq	r3, #0
-	beq	2f
-	subs	r1, r1, #1
-	bne	1b
-	add	r0, r0, #1
-2:	sub	r0, r0, r2
-	mov	pc, lr
-ENDPROC(__strnlen_user)
-
-	.pushsection .fixup,"ax"
-	.align	0
-9001:	mov	r0, #0
-	mov	pc, lr
-	.popsection
diff --git a/arch/arm/lib/uaccess.S b/arch/arm/lib/uaccess.S
index 5c908b1cb8e..e50520904b7 100644
--- a/arch/arm/lib/uaccess.S
+++ b/arch/arm/lib/uaccess.S
@@ -117,9 +117,9 @@ USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
 .Lc2u_1fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
 		bmi	.Lc2u_1nowords
-		mov	r3, r7, pull #8
+		mov	r3, r7, lspull #8
 		ldr	r7, [r1], #4
-		orr	r3, r3, r7, push #24
+		orr	r3, r3, r7, lspush #24
 USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
@@ -131,30 +131,30 @@ USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
 		subs	ip, ip, #16
 		blt	.Lc2u_1rem8lp
 
-.Lc2u_1cpy8lp:	mov	r3, r7, pull #8
+.Lc2u_1cpy8lp:	mov	r3, r7, lspull #8
 		ldmia	r1!, {r4 - r7}
 		subs	ip, ip, #16
-		orr	r3, r3, r4, push #24
-		mov	r4, r4, pull #8
-		orr	r4, r4, r5, push #24
-		mov	r5, r5, pull #8
-		orr	r5, r5, r6, push #24
-		mov	r6, r6, pull #8
-		orr	r6, r6, r7, push #24
+		orr	r3, r3, r4, lspush #24
+		mov	r4, r4, lspull #8
+		orr	r4, r4, r5, lspush #24
+		mov	r5, r5, lspull #8
+		orr	r5, r5, r6, lspush #24
+		mov	r6, r6, lspull #8
+		orr	r6, r6, r7, lspush #24
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
 		bpl	.Lc2u_1cpy8lp
 
 .Lc2u_1rem8lp:	tst	ip, #8
-		movne	r3, r7, pull #8
+		movne	r3, r7, lspull #8
 		ldmneia	r1!, {r4, r7}
-		orrne	r3, r3, r4, push #24
-		movne	r4, r4, pull #8
-		orrne	r4, r4, r7, push #24
+		orrne	r3, r3, r4, lspush #24
+		movne	r4, r4, lspull #8
+		orrne	r4, r4, r7, lspush #24
 		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
 		tst	ip, #4
-		movne	r3, r7, pull #8
+		movne	r3, r7, lspull #8
 		ldrne	r7, [r1], #4
-		orrne	r3, r3, r7, push #24
+		orrne	r3, r3, r7, lspush #24
 	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
 		beq	.Lc2u_1fupi
@@ -172,9 +172,9 @@ USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
 .Lc2u_2fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
 		bmi	.Lc2u_2nowords
-		mov	r3, r7, pull #16
+		mov	r3, r7, lspull #16
 		ldr	r7, [r1], #4
-		orr	r3, r3, r7, push #16
+		orr	r3, r3, r7, lspush #16
 USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
@@ -186,30 +186,30 @@ USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
 		subs	ip, ip, #16
 		blt	.Lc2u_2rem8lp
 
-.Lc2u_2cpy8lp:	mov	r3, r7, pull #16
+.Lc2u_2cpy8lp:	mov	r3, r7, lspull #16
 		ldmia	r1!, {r4 - r7}
 		subs	ip, ip, #16
-		orr	r3, r3, r4, push #16
-		mov	r4, r4, pull #16
-		orr	r4, r4, r5, push #16
-		mov	r5, r5, pull #16
-		orr	r5, r5, r6, push #16
-		mov	r6, r6, pull #16
-		orr	r6, r6, r7, push #16
+		orr	r3, r3, r4, lspush #16
+		mov	r4, r4, lspull #16
+		orr	r4, r4, r5, lspush #16
+		mov	r5, r5, lspull #16
+		orr	r5, r5, r6, lspush #16
+		mov	r6, r6, lspull #16
+		orr	r6, r6, r7, lspush #16
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
 		bpl	.Lc2u_2cpy8lp
 
 .Lc2u_2rem8lp:	tst	ip, #8
-		movne	r3, r7, pull #16
+		movne	r3, r7, lspull #16
 		ldmneia	r1!, {r4, r7}
-		orrne	r3, r3, r4, push #16
-		movne	r4, r4, pull #16
-		orrne	r4, r4, r7, push #16
+		orrne	r3, r3, r4, lspush #16
+		movne	r4, r4, lspull #16
+		orrne	r4, r4, r7, lspush #16
 		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
 		tst	ip, #4
-		movne	r3, r7, pull #16
+		movne	r3, r7, lspull #16
 		ldrne	r7, [r1], #4
-		orrne	r3, r3, r7, push #16
+		orrne	r3, r3, r7, lspush #16
 	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
 		beq	.Lc2u_2fupi
@@ -227,9 +227,9 @@ USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
 .Lc2u_3fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
 		bmi	.Lc2u_3nowords
-		mov	r3, r7, pull #24
+		mov	r3, r7, lspull #24
 		ldr	r7, [r1], #4
-		orr	r3, r3, r7, push #8
+		orr	r3, r3, r7, lspush #8
 USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
 		mov	ip, r0, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
@@ -241,30 +241,30 @@ USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
 		subs	ip, ip, #16
 		blt	.Lc2u_3rem8lp
 
-.Lc2u_3cpy8lp:	mov	r3, r7, pull #24
+.Lc2u_3cpy8lp:	mov	r3, r7, lspull #24
 		ldmia	r1!, {r4 - r7}
 		subs	ip, ip, #16
-		orr	r3, r3, r4, push #8
-		mov	r4, r4, pull #24
-		orr	r4, r4, r5, push #8
-		mov	r5, r5, pull #24
-		orr	r5, r5, r6, push #8
-		mov	r6, r6, pull #24
-		orr	r6, r6, r7, push #8
+		orr	r3, r3, r4, lspush #8
+		mov	r4, r4, lspull #24
+		orr	r4, r4, r5, lspush #8
+		mov	r5, r5, lspull #24
+		orr	r5, r5, r6, lspush #8
+		mov	r6, r6, lspull #24
+		orr	r6, r6, r7, lspush #8
 		stmia	r0!, {r3 - r6}			@ Shouldnt fault
 		bpl	.Lc2u_3cpy8lp
 
 .Lc2u_3rem8lp:	tst	ip, #8
-		movne	r3, r7, pull #24
+		movne	r3, r7, lspull #24
 		ldmneia	r1!, {r4, r7}
-		orrne	r3, r3, r4, push #8
-		movne	r4, r4, pull #24
-		orrne	r4, r4, r7, push #8
+		orrne	r3, r3, r4, lspush #8
+		movne	r4, r4, lspull #24
+		orrne	r4, r4, r7, lspush #8
 		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
 		tst	ip, #4
-		movne	r3, r7, pull #24
+		movne	r3, r7, lspull #24
 		ldrne	r7, [r1], #4
-		orrne	r3, r3, r7, push #8
+		orrne	r3, r3, r7, lspush #8
 	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
 		ands	ip, ip, #3
 		beq	.Lc2u_3fupi
@@ -382,9 +382,9 @@ USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
 .Lcfu_1fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
 		bmi	.Lcfu_1nowords
-		mov	r3, r7, pull #8
+		mov	r3, r7, lspull #8
 USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
-		orr	r3, r3, r7, push #24
+		orr	r3, r3, r7, lspush #24
 		str	r3, [r0], #4
 		mov	ip, r1, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
@@ -396,30 +396,30 @@ USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
 		subs	ip, ip, #16
 		blt	.Lcfu_1rem8lp
 
-.Lcfu_1cpy8lp:	mov	r3, r7, pull #8
+.Lcfu_1cpy8lp:	mov	r3, r7, lspull #8
 		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
 		subs	ip, ip, #16
-		orr	r3, r3, r4, push #24
-		mov	r4, r4, pull #8
-		orr	r4, r4, r5, push #24
-		mov	r5, r5, pull #8
-		orr	r5, r5, r6, push #24
-		mov	r6, r6, pull #8
-		orr	r6, r6, r7, push #24
+		orr	r3, r3, r4, lspush #24
+		mov	r4, r4, lspull #8
+		orr	r4, r4, r5, lspush #24
+		mov	r5, r5, lspull #8
+		orr	r5, r5, r6, lspush #24
+		mov	r6, r6, lspull #8
+		orr	r6, r6, r7, lspush #24
 		stmia	r0!, {r3 - r6}
 		bpl	.Lcfu_1cpy8lp
 
 .Lcfu_1rem8lp:	tst	ip, #8
-		movne	r3, r7, pull #8
+		movne	r3, r7, lspull #8
 		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
-		orrne	r3, r3, r4, push #24
-		movne	r4, r4, pull #8
-		orrne	r4, r4, r7, push #24
+		orrne	r3, r3, r4, lspush #24
+		movne	r4, r4, lspull #8
+		orrne	r4, r4, r7, lspush #24
 		stmneia	r0!, {r3 - r4}
 		tst	ip, #4
-		movne	r3, r7, pull #8
+		movne	r3, r7, lspull #8
 USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
-		orrne	r3, r3, r7, push #24
+		orrne	r3, r3, r7, lspush #24
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
 		beq	.Lcfu_1fupi
@@ -437,9 +437,9 @@ USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
 .Lcfu_2fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
 		bmi	.Lcfu_2nowords
-		mov	r3, r7, pull #16
+		mov	r3, r7, lspull #16
 USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
-		orr	r3, r3, r7, push #16
+		orr	r3, r3, r7, lspush #16
 		str	r3, [r0], #4
 		mov	ip, r1, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
@@ -452,30 +452,30 @@ USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
 		blt	.Lcfu_2rem8lp
 
 
-.Lcfu_2cpy8lp:	mov	r3, r7, pull #16
+.Lcfu_2cpy8lp:	mov	r3, r7, lspull #16
 		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
 		subs	ip, ip, #16
-		orr	r3, r3, r4, push #16
-		mov	r4, r4, pull #16
-		orr	r4, r4, r5, push #16
-		mov	r5, r5, pull #16
-		orr	r5, r5, r6, push #16
-		mov	r6, r6, pull #16
-		orr	r6, r6, r7, push #16
+		orr	r3, r3, r4, lspush #16
+		mov	r4, r4, lspull #16
+		orr	r4, r4, r5, lspush #16
+		mov	r5, r5, lspull #16
+		orr	r5, r5, r6, lspush #16
+		mov	r6, r6, lspull #16
+		orr	r6, r6, r7, lspush #16
 		stmia	r0!, {r3 - r6}
 		bpl	.Lcfu_2cpy8lp
 
 .Lcfu_2rem8lp:	tst	ip, #8
-		movne	r3, r7, pull #16
+		movne	r3, r7, lspull #16
 		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
-		orrne	r3, r3, r4, push #16
-		movne	r4, r4, pull #16
-		orrne	r4, r4, r7, push #16
+		orrne	r3, r3, r4, lspush #16
+		movne	r4, r4, lspull #16
+		orrne	r4, r4, r7, lspush #16
 		stmneia	r0!, {r3 - r4}
 		tst	ip, #4
-		movne	r3, r7, pull #16
+		movne	r3, r7, lspull #16
 USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
-		orrne	r3, r3, r7, push #16
+		orrne	r3, r3, r7, lspush #16
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
 		beq	.Lcfu_2fupi
@@ -493,9 +493,9 @@ USER(	TUSER(	ldrgtb) r3, [r1], #0)			@ May fault
 .Lcfu_3fupi:	subs	r2, r2, #4
 		addmi	ip, r2, #4
 		bmi	.Lcfu_3nowords
-		mov	r3, r7, pull #24
+		mov	r3, r7, lspull #24
 USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
-		orr	r3, r3, r7, push #8
+		orr	r3, r3, r7, lspush #8
 		str	r3, [r0], #4
 		mov	ip, r1, lsl #32 - PAGE_SHIFT
 		rsb	ip, ip, #0
@@ -507,30 +507,30 @@ USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
 		subs	ip, ip, #16
 		blt	.Lcfu_3rem8lp
 
-.Lcfu_3cpy8lp:	mov	r3, r7, pull #24
+.Lcfu_3cpy8lp:	mov	r3, r7, lspull #24
 		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
-		orr	r3, r3, r4, push #8
-		mov	r4, r4, pull #24
-		orr	r4, r4, r5, push #8
-		mov	r5, r5, pull #24
-		orr	r5, r5, r6, push #8
-		mov	r6, r6, pull #24
-		orr	r6, r6, r7, push #8
+		orr	r3, r3, r4, lspush #8
+		mov	r4, r4, lspull #24
+		orr	r4, r4, r5, lspush #8
+		mov	r5, r5, lspull #24
+		orr	r5, r5, r6, lspush #8
+		mov	r6, r6, lspull #24
+		orr	r6, r6, r7, lspush #8
 		stmia	r0!, {r3 - r6}
 		subs	ip, ip, #16
 		bpl	.Lcfu_3cpy8lp
 
 .Lcfu_3rem8lp:	tst	ip, #8
-		movne	r3, r7, pull #24
+		movne	r3, r7, lspull #24
 		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
-		orrne	r3, r3, r4, push #8
-		movne	r4, r4, pull #24
-		orrne	r4, r4, r7, push #8
+		orrne	r3, r3, r4, lspush #8
+		movne	r4, r4, lspull #24
+		orrne	r4, r4, r7, lspush #8
 		stmneia	r0!, {r3 - r4}
 		tst	ip, #4
-		movne	r3, r7, pull #24
+		movne	r3, r7, lspull #24
 USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
-		orrne	r3, r3, r7, push #8
+		orrne	r3, r3, r7, lspush #8
 		strne	r3, [r0], #4
 		ands	ip, ip, #3
 		beq	.Lcfu_3fupi
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 025f742dd4d..3e58d710013 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -18,6 +18,7 @@
 #include <linux/hardirq.h> /* for in_atomic() */
 #include <linux/gfp.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <asm/current.h>
 #include <asm/page.h>
 
@@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
 		return 0;
 
 	pmd = pmd_offset(pud, addr);
-	if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+	if (unlikely(pmd_none(*pmd)))
+		return 0;
+
+	/*
+	 * A pmd can be bad if it refers to a HugeTLB or THP page.
+	 *
+	 * Both THP and HugeTLB pages have the same pmd layout
+	 * and should not be manipulated by the pte functions.
+	 *
+	 * Lock the page table for the destination and check
+	 * to see that it's still huge and whether or not we will
+	 * need to fault on write, or if we have a splitting THP.
+	 */
+	if (unlikely(pmd_thp_or_huge(*pmd))) {
+		ptl = &current->mm->page_table_lock;
+		spin_lock(ptl);
+		if (unlikely(!pmd_thp_or_huge(*pmd)
+			|| pmd_hugewillfault(*pmd)
+			|| pmd_trans_splitting(*pmd))) {
+			spin_unlock(ptl);
+			return 0;
+		}
+
+		*ptep = NULL;
+		*ptlp = ptl;
+		return 1;
+	}
+
+	if (unlikely(pmd_bad(*pmd)))
 		return 0;
 
 	pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
@@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
 		from += tocopy;
 		n -= tocopy;
 
-		pte_unmap_unlock(pte, ptl);
+		if (pte)
+			pte_unmap_unlock(pte, ptl);
+		else
+			spin_unlock(ptl);
 	}
 	if (!atomic)
 		up_read(&current->mm->mmap_sem);
@@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
 		addr += tocopy;
 		n -= tocopy;
 
-		pte_unmap_unlock(pte, ptl);
+		if (pte)
+			pte_unmap_unlock(pte, ptl);
+		else
+			spin_unlock(ptl);
 	}
 	up_read(&current->mm->mmap_sem);
 
diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
new file mode 100644
index 00000000000..2c40aeab3ea
--- /dev/null
+++ b/arch/arm/lib/xor-neon.c
@@ -0,0 +1,46 @@
+/*
+ * linux/arch/arm/lib/xor-neon.c
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/raid/xor.h>
+#include <linux/module.h>
+
+MODULE_LICENSE("GPL");
+
+#ifndef __ARM_NEON__
+#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
+#endif
+
+/*
+ * Pull in the reference implementations while instructing GCC (through
+ * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
+ * NEON instructions.
+ */
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC optimize "tree-vectorize"
+#else
+/*
+ * While older versions of GCC do not generate incorrect code, they fail to
+ * recognize the parallel nature of these functions, and emit plain ARM code,
+ * which is known to be slower than the optimized ARM code in asm-arm/xor.h.
+ */
+#warning This code requires at least version 4.6 of GCC
+#endif
+
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#include <asm-generic/xor.h>
+
+struct xor_block_template const xor_block_neon_inner = {
+	.name	= "__inner_neon__",
+	.do_2	= xor_8regs_2,
+	.do_3	= xor_8regs_3,
+	.do_4	= xor_8regs_4,
+	.do_5	= xor_8regs_5,
+};
+EXPORT_SYMBOL(xor_block_neon_inner);