Diffstat (limited to 'arch/arm/lib')
58 files changed, 1763 insertions, 1294 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 391f3ab3ff3..0573faab96a 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -6,28 +6,34 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ csumpartialcopy.o csumpartialcopyuser.o clearbit.o \ - copy_page.o delay.o findbit.o memchr.o memcpy.o \ + delay.o delay-loop.o findbit.o memchr.o memcpy.o \ memmove.o memset.o memzero.o setbit.o \ - strncpy_from_user.o strnlen_user.o \ strchr.o strrchr.o \ testchangebit.o testclearbit.o testsetbit.o \ - getuser.o putuser.o clear_user.o \ ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ - ucmpdi2.o lib1funcs.o div64.o sha1.o \ - io-readsb.o io-writesb.o io-readsl.o io-writesl.o + ucmpdi2.o lib1funcs.o div64.o \ + io-readsb.o io-writesb.o io-readsl.o io-writesl.o \ + call_with_stack.o bswapsdi2.o + +mmu-y := clear_user.o copy_page.o getuser.o putuser.o # the code in uaccess.S is not preemption safe and # probably faster on ARMv3 only -ifeq ($CONFIG_PREEMPT,y) - lib-y += copy_from_user.o copy_to_user.o +ifeq ($(CONFIG_PREEMPT),y) + mmu-y += copy_from_user.o copy_to_user.o else ifneq ($(CONFIG_CPU_32v3),y) - lib-y += copy_from_user.o copy_to_user.o + mmu-y += copy_from_user.o copy_to_user.o else - lib-y += uaccess.o + mmu-y += uaccess.o endif endif +# using lib_ here won't override already available weak symbols +obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o + +lib-$(CONFIG_MMU) += $(mmu-y) + ifeq ($(CONFIG_CPU_32v3),y) lib-y += io-readsw-armv3.o io-writesw-armv3.o else @@ -35,9 +41,12 @@ else endif lib-$(CONFIG_ARCH_RPC) += ecard.o io-acorn.o floppydma.o -lib-$(CONFIG_ARCH_CLPS7500) += io-acorn.o -lib-$(CONFIG_ARCH_L7200) += io-acorn.o -lib-$(CONFIG_ARCH_SHARK) += io-shark.o $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S + +ifeq ($(CONFIG_KERNEL_MODE_NEON),y) + NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon + CFLAGS_xor-neon.o += $(NEON_FLAGS) + obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o +endif diff --git a/arch/arm/lib/ashldi3.S b/arch/arm/lib/ashldi3.S index 561e20717b3..638deb13da1 100644 --- a/arch/arm/lib/ashldi3.S +++ b/arch/arm/lib/ashldi3.S @@ -37,12 +37,17 @@ Boston, MA 02110-1301, USA. */ #endif ENTRY(__ashldi3) +ENTRY(__aeabi_llsl) subs r3, r2, #32 rsb ip, r2, #32 movmi ah, ah, lsl r2 movpl ah, al, lsl r3 - orrmi ah, ah, al, lsr ip + ARM( orrmi ah, ah, al, lsr ip ) + THUMB( lsrmi r3, al, ip ) + THUMB( orrmi ah, ah, r3 ) mov al, al, lsl r2 mov pc, lr +ENDPROC(__ashldi3) +ENDPROC(__aeabi_llsl) diff --git a/arch/arm/lib/ashrdi3.S b/arch/arm/lib/ashrdi3.S index 86fb2a90c30..015e8aa5a1d 100644 --- a/arch/arm/lib/ashrdi3.S +++ b/arch/arm/lib/ashrdi3.S @@ -37,12 +37,17 @@ Boston, MA 02110-1301, USA. 
*/ #endif ENTRY(__ashrdi3) +ENTRY(__aeabi_lasr) subs r3, r2, #32 rsb ip, r2, #32 movmi al, al, lsr r2 movpl al, ah, asr r3 - orrmi al, al, ah, lsl ip + ARM( orrmi al, al, ah, lsl ip ) + THUMB( lslmi r3, ah, ip ) + THUMB( orrmi al, al, r3 ) mov ah, ah, asr r2 mov pc, lr +ENDPROC(__ashrdi3) +ENDPROC(__aeabi_lasr) diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 68a21c0f3f5..4102be617fc 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -10,7 +10,6 @@ * 27/03/03 Ian Molton Clean up CONFIG_CPU * */ -#include <linux/config.h> #include <linux/linkage.h> #include <asm/assembler.h> .text @@ -18,105 +17,100 @@ @ fp is 0 or stack frame #define frame r4 -#define next r5 -#define save r6 +#define sv_fp r5 +#define sv_pc r6 #define mask r7 #define offset r8 -ENTRY(__backtrace) - mov r1, #0x10 - mov r0, fp - ENTRY(c_backtrace) -#ifndef CONFIG_FRAME_POINTER +#if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK) mov pc, lr +ENDPROC(c_backtrace) #else - stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location... - tst r1, #0x10 @ 26 or 32-bit? - moveq mask, #0xfc000003 - movne mask, #0 - tst mask, r0 - movne r0, #0 - movs frame, r0 -1: moveq r0, #-2 - LOADREGS(eqfd, sp!, {r4 - r8, pc}) - -2: stmfd sp!, {pc} @ calculate offset of PC in STMIA instruction - ldr r0, [sp], #4 - adr r1, 2b - 4 + movs frame, r0 @ if frame pointer is zero + beq no_frame @ we have no stack frames + + tst r1, #0x10 @ 26 or 32-bit mode? + ARM( moveq mask, #0xfc000003 ) + THUMB( moveq mask, #0xfc000000 ) + THUMB( orreq mask, #0x03 ) + movne mask, #0 @ mask for 32-bit + +1: stmfd sp!, {pc} @ calculate offset of PC stored + ldr r0, [sp], #4 @ by stmfd for this CPU + adr r1, 1b sub offset, r0, r1 -3: tst frame, mask @ Check for address exceptions... 
- bne 1b +/* + * Stack frame layout: + * optionally saved caller registers (r4 - r10) + * saved fp + * saved sp + * saved lr + * frame => saved pc + * optionally saved arguments (r0 - r3) + * saved sp => <next word> + * + * Functions start with the following code sequence: + * mov ip, sp + * stmfd sp!, {r0 - r3} (optional) + * corrected pc => stmfd sp!, {..., fp, ip, lr, pc} + */ +for_each_frame: tst frame, mask @ Check for address exceptions + bne no_frame + +1001: ldr sv_pc, [frame, #0] @ get saved pc +1002: ldr sv_fp, [frame, #-12] @ get saved fp -1001: ldr next, [frame, #-12] @ get fp -1002: ldr r2, [frame, #-4] @ get lr -1003: ldr r3, [frame, #0] @ get pc - sub save, r3, offset @ Correct PC for prefetching - bic save, save, mask -1004: ldr r1, [save, #0] @ get instruction at function - mov r1, r1, lsr #10 - ldr r3, .Ldsi+4 - teq r1, r3 - subeq save, save, #4 - mov r0, save - bic r1, r2, mask + sub sv_pc, sv_pc, offset @ Correct PC for prefetching + bic sv_pc, sv_pc, mask @ mask PC/LR for the mode + +1003: ldr r2, [sv_pc, #-4] @ if stmfd sp!, {args} exists, + ldr r3, .Ldsi+4 @ adjust saved 'pc' back one + teq r3, r2, lsr #10 @ instruction + subne r0, sv_pc, #4 @ allow for mov + subeq r0, sv_pc, #8 @ allow for mov + stmia + + ldr r1, [frame, #-4] @ get saved lr + mov r2, frame + bic r1, r1, mask @ mask PC/LR for the mode bl dump_backtrace_entry - ldr r0, [frame, #-8] @ get sp - sub r0, r0, #4 -1005: ldr r1, [save, #4] @ get instruction at function+4 - mov r3, r1, lsr #10 - ldr r2, .Ldsi+4 - teq r3, r2 @ Check for stmia sp!, {args} - addeq save, save, #4 @ next instruction - bleq .Ldumpstm - - sub r0, frame, #16 -1006: ldr r1, [save, #4] @ Get 'stmia sp!, {rlist, fp, ip, lr, pc}' instruction - mov r3, r1, lsr #10 - ldr r2, .Ldsi - teq r3, r2 - bleq .Ldumpstm - - /* - * A zero next framepointer means we're done. - */ - teq next, #0 - LOADREGS(eqfd, sp!, {r4 - r8, pc}) - - /* - * The next framepointer must be above the - * current framepointer. 
- */ - cmp next, frame - mov frame, next - bhi 3b - b 1007f + ldr r1, [sv_pc, #-4] @ if stmfd sp!, {args} exists, + ldr r3, .Ldsi+4 + teq r3, r1, lsr #11 + ldreq r0, [frame, #-8] @ get sp + subeq r0, r0, #4 @ point at the last arg + bleq .Ldumpstm @ dump saved registers -/* - * Fixup for LDMDB - */ - .section .fixup,"ax" - .align 0 -1007: ldr r0, =.Lbad +1004: ldr r1, [sv_pc, #0] @ if stmfd sp!, {..., fp, ip, lr, pc} + ldr r3, .Ldsi @ instruction exists, + teq r3, r1, lsr #11 + subeq r0, frame, #16 + bleq .Ldumpstm @ dump saved registers + + teq sv_fp, #0 @ zero saved fp means + beq no_frame @ no further frames + + cmp sv_fp, frame @ next frame must be + mov frame, sv_fp @ above the current frame + bhi for_each_frame + +1006: adr r0, .Lbad mov r1, frame bl printk - LOADREGS(fd, sp!, {r4 - r8, pc}) - .ltorg - .previous +no_frame: ldmfd sp!, {r4 - r8, pc} +ENDPROC(c_backtrace) - .section __ex_table,"a" + .pushsection __ex_table,"a" .align 3 - .long 1001b, 1007b - .long 1002b, 1007b - .long 1003b, 1007b - .long 1004b, 1007b - .long 1005b, 1007b - .long 1006b, 1007b - .previous + .long 1001b, 1006b + .long 1002b, 1006b + .long 1003b, 1006b + .long 1004b, 1006b + .popsection #define instr r4 #define reg r5 @@ -125,16 +119,18 @@ ENTRY(c_backtrace) .Ldumpstm: stmfd sp!, {instr, reg, stack, r7, lr} mov stack, r0 mov instr, r1 - mov reg, #9 + mov reg, #10 mov r7, #0 1: mov r3, #1 - tst instr, r3, lsl reg + ARM( tst instr, r3, lsl reg ) + THUMB( lsl r3, reg ) + THUMB( tst instr, r3 ) beq 2f add r7, r7, #1 - teq r7, #4 + teq r7, #6 moveq r7, #0 - moveq r3, #'\n' - movne r3, #' ' + adr r3, .Lcr + addne r3, r3, #1 @ skip newline ldr r2, [stack], #-4 mov r1, reg adr r0, .Lfp @@ -144,14 +140,13 @@ ENTRY(c_backtrace) teq r7, #0 adrne r0, .Lcr blne printk - mov r0, stack - LOADREGS(fd, sp!, {instr, reg, stack, r7, pc}) + ldmfd sp!, {instr, reg, stack, r7, pc} -.Lfp: .asciz " r%d = %08X%c" +.Lfp: .asciz " r%d:%08x%s" .Lcr: .asciz "\n" .Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" .align -.Ldsi: .word 0x00e92dd8 >> 2 - .word 0x00e92d00 >> 2 +.Ldsi: .word 0xe92dd800 >> 11 @ stmfd sp!, {... 
fp, ip, lr, pc} + .word 0xe92d0000 >> 11 @ stmfd sp!, {} #endif diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index f35d91fbe11..9f12ed1eea8 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -1,45 +1,78 @@ -#include <linux/config.h> +#include <asm/unwind.h> -#if __LINUX_ARM_ARCH__ >= 6 && defined(CONFIG_CPU_32v6K) - .macro bitop, instr +#if __LINUX_ARM_ARCH__ >= 6 + .macro bitop, name, instr +ENTRY( \name ) +UNWIND( .fnstart ) + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned mov r2, #1 - and r3, r0, #7 @ Get bit offset - add r1, r1, r0, lsr #3 @ Get byte offset + and r3, r0, #31 @ Get bit offset + mov r0, r0, lsr #5 + add r1, r1, r0, lsl #2 @ Get word offset +#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) + .arch_extension mp + ALT_SMP(W(pldw) [r1]) + ALT_UP(W(nop)) +#endif mov r3, r2, lsl r3 -1: ldrexb r2, [r1] +1: ldrex r2, [r1] \instr r2, r2, r3 - strexb r0, r2, [r1] + strex r0, r2, [r1] cmp r0, #0 bne 1b - mov pc, lr + bx lr +UNWIND( .fnend ) +ENDPROC(\name ) .endm - .macro testop, instr, store - and r3, r0, #7 @ Get bit offset + .macro testop, name, instr, store +ENTRY( \name ) +UNWIND( .fnstart ) + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned mov r2, #1 - add r1, r1, r0, lsr #3 @ Get byte offset + and r3, r0, #31 @ Get bit offset + mov r0, r0, lsr #5 + add r1, r1, r0, lsl #2 @ Get word offset mov r3, r2, lsl r3 @ create mask -1: ldrexb r2, [r1] + smp_dmb +#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) + .arch_extension mp + ALT_SMP(W(pldw) [r1]) + ALT_UP(W(nop)) +#endif +1: ldrex r2, [r1] ands r0, r2, r3 @ save old value of bit - \instr r2, r2, r3 @ toggle bit - strexb ip, r2, [r1] + \instr r2, r2, r3 @ toggle bit + strex ip, r2, [r1] cmp ip, #0 bne 1b + smp_dmb cmp r0, #0 movne r0, #1 -2: mov pc, lr +2: bx lr +UNWIND( .fnend ) +ENDPROC(\name ) .endm #else - .macro bitop, instr - and r2, r0, #7 + .macro bitop, name, instr +ENTRY( \name ) +UNWIND( .fnstart ) + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned + and r2, r0, #31 + mov r0, r0, lsr #5 mov r3, #1 mov r3, r3, lsl r2 - save_and_disable_irqs ip, r2 - ldrb r2, [r1, r0, lsr #3] + save_and_disable_irqs ip + ldr r2, [r1, r0, lsl #2] \instr r2, r2, r3 - strb r2, [r1, r0, lsr #3] + str r2, [r1, r0, lsl #2] restore_irqs ip mov pc, lr +UNWIND( .fnend ) +ENDPROC(\name ) .endm /** @@ -48,19 +81,25 @@ * @store: store instruction * * Note: we can trivially conditionalise the store instruction - * to avoid dirting the data cache. + * to avoid dirtying the data cache. */ - .macro testop, instr, store - add r1, r1, r0, lsr #3 - and r3, r0, #7 + .macro testop, name, instr, store +ENTRY( \name ) +UNWIND( .fnstart ) + ands ip, r1, #3 + strneb r1, [ip] @ assert word-aligned + and r3, r0, #31 + mov r0, r0, lsr #5 + save_and_disable_irqs ip + ldr r2, [r1, r0, lsl #2]! 
mov r0, #1 - save_and_disable_irqs ip, r2 - ldrb r2, [r1] tst r2, r0, lsl r3 \instr r2, r2, r0, lsl r3 \store r2, [r1] - restore_irqs ip moveq r0, #0 + restore_irqs ip mov pc, lr +UNWIND( .fnend ) +ENDPROC(\name ) .endm #endif diff --git a/arch/arm/lib/bswapsdi2.S b/arch/arm/lib/bswapsdi2.S new file mode 100644 index 00000000000..9fcdd154eff --- /dev/null +++ b/arch/arm/lib/bswapsdi2.S @@ -0,0 +1,36 @@ +#include <linux/linkage.h> + +#if __LINUX_ARM_ARCH__ >= 6 +ENTRY(__bswapsi2) + rev r0, r0 + bx lr +ENDPROC(__bswapsi2) + +ENTRY(__bswapdi2) + rev r3, r0 + rev r0, r1 + mov r1, r3 + bx lr +ENDPROC(__bswapdi2) +#else +ENTRY(__bswapsi2) + eor r3, r0, r0, ror #16 + mov r3, r3, lsr #8 + bic r3, r3, #0xff00 + eor r0, r3, r0, ror #8 + mov pc, lr +ENDPROC(__bswapsi2) + +ENTRY(__bswapdi2) + mov ip, r1 + eor r3, ip, ip, ror #16 + eor r1, r0, r0, ror #16 + mov r1, r1, lsr #8 + mov r3, r3, lsr #8 + bic r3, r3, #0xff00 + bic r1, r1, #0xff00 + eor r1, r1, r0, ror #8 + eor r0, r3, ip, ror #8 + mov pc, lr +ENDPROC(__bswapdi2) +#endif diff --git a/arch/arm/lib/call_with_stack.S b/arch/arm/lib/call_with_stack.S new file mode 100644 index 00000000000..916c80f13ae --- /dev/null +++ b/arch/arm/lib/call_with_stack.S @@ -0,0 +1,44 @@ +/* + * arch/arm/lib/call_with_stack.S + * + * Copyright (C) 2011 ARM Ltd. + * Written by Will Deacon <will.deacon@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * void call_with_stack(void (*fn)(void *), void *arg, void *sp) + * + * Change the stack to that pointed at by sp, then invoke fn(arg) with + * the new stack. + */ +ENTRY(call_with_stack) + str sp, [r2, #-4]! + str lr, [r2, #-4]! 
+ + mov sp, r2 + mov r2, r0 + mov r0, r1 + + adr lr, BSYM(1f) + mov pc, r2 + +1: ldr lr, [sp] + ldr sp, [sp, #4] + mov pc, lr +ENDPROC(call_with_stack) diff --git a/arch/arm/lib/changebit.S b/arch/arm/lib/changebit.S index 389567c2409..f4027862172 100644 --- a/arch/arm/lib/changebit.S +++ b/arch/arm/lib/changebit.S @@ -12,10 +12,4 @@ #include "bitops.h" .text -/* Purpose : Function to change a bit - * Prototype: int change_bit(int bit, void *addr) - */ -ENTRY(_change_bit_be) - eor r0, r0, #0x18 @ big endian byte ordering -ENTRY(_change_bit_le) - bitop eor +bitop _change_bit, eor diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S index 7ff9f831b3f..14a0d988c82 100644 --- a/arch/arm/lib/clear_user.S +++ b/arch/arm/lib/clear_user.S @@ -12,13 +12,14 @@ .text -/* Prototype: int __arch_clear_user(void *addr, size_t sz) +/* Prototype: int __clear_user(void *addr, size_t sz) * Purpose : clear some user memory * Params : addr - user memory address to clear * : sz - number of bytes to clear * Returns : number of bytes NOT cleared */ -ENTRY(__arch_clear_user) +ENTRY(__clear_user_std) +WEAK(__clear_user) stmfd sp!, {r1, lr} mov r2, #0 cmp r1, #4 @@ -26,27 +27,28 @@ ENTRY(__arch_clear_user) ands ip, r0, #3 beq 1f cmp ip, #2 -USER( strbt r2, [r0], #1) -USER( strlebt r2, [r0], #1) -USER( strltbt r2, [r0], #1) + strusr r2, r0, 1 + strusr r2, r0, 1, le + strusr r2, r0, 1, lt rsb ip, ip, #4 sub r1, r1, ip @ 7 6 5 4 3 2 1 1: subs r1, r1, #8 @ -1 -2 -3 -4 -5 -6 -7 -USER( strplt r2, [r0], #4) -USER( strplt r2, [r0], #4) + strusr r2, r0, 4, pl, rept=2 bpl 1b adds r1, r1, #4 @ 3 2 1 0 -1 -2 -3 -USER( strplt r2, [r0], #4) + strusr r2, r0, 4, pl 2: tst r1, #2 @ 1x 1x 0x 0x 1x 1x 0x -USER( strnebt r2, [r0], #1) -USER( strnebt r2, [r0], #1) + strusr r2, r0, 1, ne, rept=2 tst r1, #1 @ x1 x0 x1 x0 x1 x0 x1 -USER( strnebt r2, [r0], #1) + it ne @ explicit IT needed for the label +USER( strnebt r2, [r0]) mov r0, #0 - LOADREGS(fd,sp!, {r1, pc}) + ldmfd sp!, {r1, pc} +ENDPROC(__clear_user) +ENDPROC(__clear_user_std) - .section .fixup,"ax" + .pushsection .fixup,"ax" .align 0 -9001: LOADREGS(fd,sp!, {r0, pc}) - .previous +9001: ldmfd sp!, {r0, pc} + .popsection diff --git a/arch/arm/lib/clearbit.S b/arch/arm/lib/clearbit.S index 34751653302..f6b75fb64d3 100644 --- a/arch/arm/lib/clearbit.S +++ b/arch/arm/lib/clearbit.S @@ -12,11 +12,4 @@ #include "bitops.h" .text -/* - * Purpose : Function to clear a bit - * Prototype: int clear_bit(int bit, void *addr) - */ -ENTRY(_clear_bit_be) - eor r0, r0, #0x18 @ big endian byte ordering -ENTRY(_clear_bit_le) - bitop bic +bitop _clear_bit, bic diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S index 7497393a0e8..66a477a3e3c 100644 --- a/arch/arm/lib/copy_from_user.S +++ b/arch/arm/lib/copy_from_user.S @@ -16,7 +16,7 @@ /* * Prototype: * - * size_t __arch_copy_from_user(void *to, const void *from, size_t n) + * size_t __copy_from_user(void *to, const void *from, size_t n) * * Purpose: * @@ -33,11 +33,15 @@ * Number of bytes NOT copied. 
*/ +#ifndef CONFIG_THUMB2_KERNEL +#define LDR1W_SHIFT 0 +#else +#define LDR1W_SHIFT 1 +#endif +#define STR1W_SHIFT 0 + .macro ldr1w ptr reg abort -100: ldrt \reg, [\ptr], #4 - .section __ex_table, "a" - .long 100b, \abort - .previous + ldrusr \reg, \ptr, 4, abort=\abort .endm .macro ldr4w ptr reg1 reg2 reg3 reg4 abort @@ -53,14 +57,11 @@ .endm .macro ldr1b ptr reg cond=al abort -100: ldr\cond\()bt \reg, [\ptr], #1 - .section __ex_table, "a" - .long 100b, \abort - .previous + ldrusr \reg, \ptr, 1, \cond, abort=\abort .endm .macro str1w ptr reg abort - str \reg, [\ptr], #4 + W(str) \reg, [\ptr], #4 .endm .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort @@ -83,11 +84,13 @@ .text -ENTRY(__arch_copy_from_user) +ENTRY(__copy_from_user) #include "copy_template.S" - .section .fixup,"ax" +ENDPROC(__copy_from_user) + + .pushsection .fixup,"ax" .align 0 copy_abort_preamble ldmfd sp!, {r1, r2} @@ -97,5 +100,5 @@ ENTRY(__arch_copy_from_user) bl __memzero ldr r0, [sp], #4 copy_abort_end - .previous + .popsection diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S index 68117968482..6ee2f6706f8 100644 --- a/arch/arm/lib/copy_page.S +++ b/arch/arm/lib/copy_page.S @@ -12,8 +12,9 @@ #include <linux/linkage.h> #include <asm/assembler.h> #include <asm/asm-offsets.h> +#include <asm/cache.h> -#define COPY_COUNT (PAGE_SZ/64 PLD( -1 )) +#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 )) .text .align 5 @@ -26,21 +27,21 @@ ENTRY(copy_page) stmfd sp!, {r4, lr} @ 2 PLD( pld [r1, #0] ) - PLD( pld [r1, #32] ) + PLD( pld [r1, #L1_CACHE_BYTES] ) mov r2, #COPY_COUNT @ 1 ldmia r1!, {r3, r4, ip, lr} @ 4+1 -1: PLD( pld [r1, #64] ) - PLD( pld [r1, #96] ) -2: stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 - stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 +1: PLD( pld [r1, #2 * L1_CACHE_BYTES]) + PLD( pld [r1, #3 * L1_CACHE_BYTES]) +2: + .rept (2 * L1_CACHE_BYTES / 16 - 1) stmia r0!, {r3, r4, ip, lr} @ 4 ldmia r1!, {r3, r4, ip, lr} @ 4 + .endr subs r2, r2, #1 @ 1 stmia r0!, {r3, r4, ip, lr} @ 4 ldmgtia r1!, {r3, r4, ip, lr} @ 4 bgt 1b @ 1 PLD( ldmeqia r1!, {r3, r4, ip, lr} ) PLD( beq 2b ) - LOADREGS(fd, sp!, {r4, pc}) @ 3 + ldmfd sp!, {r4, pc} @ 3 +ENDPROC(copy_page) diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S index 838e435e492..3bc8eb811a7 100644 --- a/arch/arm/lib/copy_template.S +++ b/arch/arm/lib/copy_template.S @@ -13,14 +13,6 @@ */ /* - * This can be used to enable code to cacheline align the source pointer. - * Experiments on tested architectures (StrongARM and XScale) didn't show - * this a worthwhile thing to do. That might be different in the future. - */ -//#define CALGN(code...) code -#define CALGN(code...) - -/* * Theory of operation * ------------------- * @@ -65,6 +57,13 @@ * * Restore registers with the values previously saved with the * 'preserv' macro. Called upon code termination. 
+ * + * LDR1W_SHIFT + * STR1W_SHIFT + * + * Correction to be applied to the "ip" register when branching into + * the ldr1w or str1w instructions (some of these macros may expand to + * than one 32bit instruction in Thumb-2) */ @@ -82,7 +81,7 @@ stmfd sp!, {r5 - r8} blt 5f - CALGN( ands ip, r1, #31 ) + CALGN( ands ip, r0, #31 ) CALGN( rsb r3, ip, #32 ) CALGN( sbcnes r4, r3, r2 ) @ C is always set here CALGN( bcs 2f ) @@ -107,9 +106,15 @@ 5: ands ip, r2, #28 rsb ip, ip, #32 +#if LDR1W_SHIFT > 0 + lsl ip, ip, #LDR1W_SHIFT +#endif addne pc, pc, ip @ C is always clear here b 7f -6: nop +6: + .rept (1 << LDR1W_SHIFT) + W(nop) + .endr ldr1w r1, r3, abort=20f ldr1w r1, r4, abort=20f ldr1w r1, r5, abort=20f @@ -118,9 +123,16 @@ ldr1w r1, r8, abort=20f ldr1w r1, lr, abort=20f +#if LDR1W_SHIFT < STR1W_SHIFT + lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT +#elif LDR1W_SHIFT > STR1W_SHIFT + lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT +#endif add pc, pc, ip nop - nop + .rept (1 << STR1W_SHIFT) + W(nop) + .endr str1w r0, r3, abort=20f str1w r0, r4, abort=20f str1w r0, r5, abort=20f @@ -168,7 +180,7 @@ subs r2, r2, #28 blt 14f - CALGN( ands ip, r1, #31 ) + CALGN( ands ip, r0, #31 ) CALGN( rsb ip, ip, #32 ) CALGN( sbcnes r4, ip, r2 ) @ C is always set here CALGN( subcc r2, r2, ip ) @@ -185,24 +197,24 @@ 12: PLD( pld [r1, #124] ) 13: ldr4w r1, r4, r5, r6, r7, abort=19f - mov r3, lr, pull #\pull + mov r3, lr, lspull #\pull subs r2, r2, #32 ldr4w r1, r8, r9, ip, lr, abort=19f - orr r3, r3, r4, push #\push - mov r4, r4, pull #\pull - orr r4, r4, r5, push #\push - mov r5, r5, pull #\pull - orr r5, r5, r6, push #\push - mov r6, r6, pull #\pull - orr r6, r6, r7, push #\push - mov r7, r7, pull #\pull - orr r7, r7, r8, push #\push - mov r8, r8, pull #\pull - orr r8, r8, r9, push #\push - mov r9, r9, pull #\pull - orr r9, r9, ip, push #\push - mov ip, ip, pull #\pull - orr ip, ip, lr, push #\push + orr r3, r3, r4, lspush #\push + mov r4, r4, lspull #\pull + orr r4, r4, r5, lspush #\push + mov r5, r5, lspull #\pull + orr r5, r5, r6, lspush #\push + mov r6, r6, lspull #\pull + orr r6, r6, r7, lspush #\push + mov r7, r7, lspull #\pull + orr r7, r7, r8, lspush #\push + mov r8, r8, lspull #\pull + orr r8, r8, r9, lspush #\push + mov r9, r9, lspull #\pull + orr r9, r9, ip, lspush #\push + mov ip, ip, lspull #\pull + orr ip, ip, lr, lspush #\push str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f bge 12b PLD( cmn r2, #96 ) @@ -213,10 +225,10 @@ 14: ands ip, r2, #28 beq 16f -15: mov r3, lr, pull #\pull +15: mov r3, lr, lspull #\pull ldr1w r1, lr, abort=21f subs ip, ip, #4 - orr r3, r3, lr, push #\push + orr r3, r3, lr, lspush #\push str1w r0, r3, abort=21f bgt 15b CALGN( cmp r2, #0 ) @@ -236,7 +248,7 @@ /* - * Abort preanble and completion macros. + * Abort preamble and completion macros. * If a fixup handler is required then those macros must surround it. * It is assumed that the fixup code will handle the private part of * the exit macro. diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S index 4a6d8ea1402..d066df686e1 100644 --- a/arch/arm/lib/copy_to_user.S +++ b/arch/arm/lib/copy_to_user.S @@ -16,7 +16,7 @@ /* * Prototype: * - * size_t __arch_copy_to_user(void *to, const void *from, size_t n) + * size_t __copy_to_user(void *to, const void *from, size_t n) * * Purpose: * @@ -33,8 +33,15 @@ * Number of bytes NOT copied. 
*/ +#define LDR1W_SHIFT 0 +#ifndef CONFIG_THUMB2_KERNEL +#define STR1W_SHIFT 0 +#else +#define STR1W_SHIFT 1 +#endif + .macro ldr1w ptr reg abort - ldr \reg, [\ptr], #4 + W(ldr) \reg, [\ptr], #4 .endm .macro ldr4w ptr reg1 reg2 reg3 reg4 abort @@ -50,10 +57,7 @@ .endm .macro str1w ptr reg abort -100: strt \reg, [\ptr], #4 - .section __ex_table, "a" - .long 100b, \abort - .previous + strusr \reg, \ptr, 4, abort=\abort .endm .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort @@ -68,10 +72,7 @@ .endm .macro str1b ptr reg cond=al abort -100: str\cond\()bt \reg, [\ptr], #1 - .section __ex_table, "a" - .long 100b, \abort - .previous + strusr \reg, \ptr, 1, \cond, abort=\abort .endm .macro enter reg1 reg2 @@ -86,16 +87,20 @@ .text -ENTRY(__arch_copy_to_user) +ENTRY(__copy_to_user_std) +WEAK(__copy_to_user) #include "copy_template.S" - .section .fixup,"ax" +ENDPROC(__copy_to_user) +ENDPROC(__copy_to_user_std) + + .pushsection .fixup,"ax" .align 0 copy_abort_preamble ldmfd sp!, {r1, r2, r3} sub r0, r0, r1 rsb r0, r0, r2 copy_abort_end - .previous + .popsection diff --git a/arch/arm/lib/csumipv6.S b/arch/arm/lib/csumipv6.S index 7065a20ee8a..3ac6ef01bc4 100644 --- a/arch/arm/lib/csumipv6.S +++ b/arch/arm/lib/csumipv6.S @@ -28,5 +28,6 @@ ENTRY(__csum_ipv6_magic) adcs r0, r0, r3 adcs r0, r0, r2 adcs r0, r0, #0 - LOADREGS(fd, sp!, {pc}) + ldmfd sp!, {pc} +ENDPROC(__csum_ipv6_magic) diff --git a/arch/arm/lib/csumpartial.S b/arch/arm/lib/csumpartial.S index cb5e3708f11..31d3cb34740 100644 --- a/arch/arm/lib/csumpartial.S +++ b/arch/arm/lib/csumpartial.S @@ -26,7 +26,7 @@ td1 .req r4 @ save before use td2 .req r5 @ save before use td3 .req lr -.zero: mov r0, sum +.Lzero: mov r0, sum add sp, sp, #4 ldr pc, [sp], #4 @@ -34,21 +34,22 @@ td3 .req lr * Handle 0 to 7 bytes, with any alignment of source and * destination pointers. Note that when we get here, C = 0 */ -.less8: teq len, #0 @ check for zero count - beq .zero +.Lless8: teq len, #0 @ check for zero count + beq .Lzero /* we must have at least one byte. */ tst buf, #1 @ odd address? + movne sum, sum, ror #8 ldrneb td0, [buf], #1 subne len, len, #1 adcnes sum, sum, td0, put_byte_1 -.less4: tst len, #6 - beq .less8_byte +.Lless4: tst len, #6 + beq .Lless8_byte /* we are now half-word aligned */ -.less8_wordlp: +.Lless8_wordlp: #if __LINUX_ARM_ARCH__ >= 4 ldrh td0, [buf], #2 sub len, len, #2 @@ -64,19 +65,19 @@ td3 .req lr #endif adcs sum, sum, td0 tst len, #6 - bne .less8_wordlp + bne .Lless8_wordlp -.less8_byte: tst len, #1 @ odd number of bytes +.Lless8_byte: tst len, #1 @ odd number of bytes ldrneb td0, [buf], #1 @ include last byte adcnes sum, sum, td0, put_byte_0 @ update checksum -.done: adc r0, sum, #0 @ collect up the last carry +.Ldone: adc r0, sum, #0 @ collect up the last carry ldr td0, [sp], #4 tst td0, #1 @ check buffer alignment movne r0, r0, ror #8 @ rotate checksum by 8 bits ldr pc, [sp], #4 @ return -.not_aligned: tst buf, #1 @ odd address +.Lnot_aligned: tst buf, #1 @ odd address ldrneb td0, [buf], #1 @ make even subne len, len, #1 adcnes sum, sum, td0, put_byte_1 @ update checksum @@ -101,11 +102,14 @@ td3 .req lr ENTRY(csum_partial) stmfd sp!, {buf, lr} cmp len, #8 @ Ensure that we have at least - blo .less8 @ 8 bytes to copy. + blo .Lless8 @ 8 bytes to copy. 
+ + tst buf, #1 + movne sum, sum, ror #8 adds sum, sum, #0 @ C = 0 tst buf, #3 @ Test destination alignment - blne .not_aligned @ aligh destination, return here + blne .Lnot_aligned @ align destination, return here 1: bics ip, len, #31 beq 3f @@ -127,11 +131,12 @@ ENTRY(csum_partial) ldmfd sp!, {r4 - r5} 3: tst len, #0x1c @ should not change C - beq .less4 + beq .Lless4 4: ldr td0, [buf], #4 sub len, len, #4 adcs sum, sum, td0 tst len, #0x1c bne 4b - b .less4 + b .Lless4 +ENDPROC(csum_partial) diff --git a/arch/arm/lib/csumpartialcopy.S b/arch/arm/lib/csumpartialcopy.S index 990ee63b246..d03fc71fc88 100644 --- a/arch/arm/lib/csumpartialcopy.S +++ b/arch/arm/lib/csumpartialcopy.S @@ -18,11 +18,11 @@ */ .macro save_regs - stmfd sp!, {r1, r4 - r8, fp, ip, lr, pc} + stmfd sp!, {r1, r4 - r8, lr} .endm - .macro load_regs,flags - LOADREGS(\flags,fp,{r1, r4 - r8, fp, sp, pc}) + .macro load_regs + ldmfd sp!, {r1, r4 - r8, pc} .endm .macro load1b, reg1 @@ -48,5 +48,6 @@ .endm #define FN_ENTRY ENTRY(csum_partial_copy_nocheck) +#define FN_EXIT ENDPROC(csum_partial_copy_nocheck) #include "csumpartialcopygeneric.S" diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S index d3a2f4667db..d6e742d2400 100644 --- a/arch/arm/lib/csumpartialcopygeneric.S +++ b/arch/arm/lib/csumpartialcopygeneric.S @@ -22,8 +22,8 @@ dst .req r1 len .req r2 sum .req r3 -.zero: mov r0, sum - load_regs ea +.Lzero: mov r0, sum + load_regs /* * Align an unaligned destination pointer. We know that @@ -31,8 +31,9 @@ sum .req r3 * the length. Note that the source pointer hasn't been * aligned yet. */ -.dst_unaligned: tst dst, #1 - beq .dst_16bit +.Ldst_unaligned: + tst dst, #1 + beq .Ldst_16bit load1b ip sub len, len, #1 @@ -41,7 +42,7 @@ sum .req r3 tst dst, #2 moveq pc, lr @ dst is now 32bit aligned -.dst_16bit: load2b r8, ip +.Ldst_16bit: load2b r8, ip sub len, len, #2 adcs sum, sum, r8, put_byte_0 strb r8, [dst], #1 @@ -53,12 +54,12 @@ sum .req r3 * Handle 0 to 7 bytes, with any alignment of source and * destination pointers. Note that when we get here, C = 0 */ -.less8: teq len, #0 @ check for zero count - beq .zero +.Lless8: teq len, #0 @ check for zero count + beq .Lzero /* we must have at least one byte. */ tst dst, #1 @ dst 16-bit aligned - beq .less8_aligned + beq .Lless8_aligned /* Align dst */ load1b ip @@ -66,7 +67,7 @@ sum .req r3 adcs sum, sum, ip, put_byte_1 @ update checksum strb ip, [dst], #1 tst len, #6 - beq .less8_byteonly + beq .Lless8_byteonly 1: load2b r8, ip sub len, len, #2 @@ -74,27 +75,26 @@ sum .req r3 strb r8, [dst], #1 adcs sum, sum, ip, put_byte_1 strb ip, [dst], #1 -.less8_aligned: tst len, #6 +.Lless8_aligned: + tst len, #6 bne 1b -.less8_byteonly: +.Lless8_byteonly: tst len, #1 - beq .done + beq .Ldone load1b r8 adcs sum, sum, r8, put_byte_0 @ update checksum strb r8, [dst], #1 - b .done + b .Ldone FN_ENTRY - mov ip, sp save_regs - sub fp, ip, #4 cmp len, #8 @ Ensure that we have at least - blo .less8 @ 8 bytes to copy. + blo .Lless8 @ 8 bytes to copy. 
adds sum, sum, #0 @ C = 0 tst dst, #3 @ Test destination alignment - blne .dst_unaligned @ align destination, return here + blne .Ldst_unaligned @ align destination, return here /* * Ok, the dst pointer is now 32bit aligned, and we know @@ -103,7 +103,7 @@ FN_ENTRY */ tst src, #3 @ Test source alignment - bne .src_not_aligned + bne .Lsrc_not_aligned /* Routine for src & dst aligned */ @@ -136,17 +136,17 @@ FN_ENTRY adcs sum, sum, r4 4: ands len, len, #3 - beq .done + beq .Ldone load1l r4 tst len, #2 mov r5, r4, get_byte_0 - beq .exit - adcs sum, sum, r4, push #16 + beq .Lexit + adcs sum, sum, r4, lspush #16 strb r5, [dst], #1 mov r5, r4, get_byte_1 strb r5, [dst], #1 mov r5, r4, get_byte_2 -.exit: tst len, #1 +.Lexit: tst len, #1 strneb r5, [dst], #1 andne r5, r5, #255 adcnes sum, sum, r5, put_byte_0 @@ -157,37 +157,37 @@ FN_ENTRY * the inefficient byte manipulations in the * architecture independent code. */ -.done: adc r0, sum, #0 +.Ldone: adc r0, sum, #0 ldr sum, [sp, #0] @ dst tst sum, #1 movne r0, r0, ror #8 - load_regs ea + load_regs -.src_not_aligned: +.Lsrc_not_aligned: adc sum, sum, #0 @ include C from dst alignment and ip, src, #3 bic src, src, #3 load1l r5 cmp ip, #2 - beq .src2_aligned - bhi .src3_aligned - mov r4, r5, pull #8 @ C = 0 + beq .Lsrc2_aligned + bhi .Lsrc3_aligned + mov r4, r5, lspull #8 @ C = 0 bics ip, len, #15 beq 2f 1: load4l r5, r6, r7, r8 - orr r4, r4, r5, push #24 - mov r5, r5, pull #8 - orr r5, r5, r6, push #24 - mov r6, r6, pull #8 - orr r6, r6, r7, push #24 - mov r7, r7, pull #8 - orr r7, r7, r8, push #24 + orr r4, r4, r5, lspush #24 + mov r5, r5, lspull #8 + orr r5, r5, r6, lspush #24 + mov r6, r6, lspull #8 + orr r6, r6, r7, lspush #24 + mov r7, r7, lspull #8 + orr r7, r7, r8, lspush #24 stmia dst!, {r4, r5, r6, r7} adcs sum, sum, r4 adcs sum, sum, r5 adcs sum, sum, r6 adcs sum, sum, r7 - mov r4, r8, pull #8 + mov r4, r8, lspull #8 sub ip, ip, #16 teq ip, #0 bne 1b @@ -196,50 +196,50 @@ FN_ENTRY tst ip, #8 beq 3f load2l r5, r6 - orr r4, r4, r5, push #24 - mov r5, r5, pull #8 - orr r5, r5, r6, push #24 + orr r4, r4, r5, lspush #24 + mov r5, r5, lspull #8 + orr r5, r5, r6, lspush #24 stmia dst!, {r4, r5} adcs sum, sum, r4 adcs sum, sum, r5 - mov r4, r6, pull #8 + mov r4, r6, lspull #8 tst ip, #4 beq 4f 3: load1l r5 - orr r4, r4, r5, push #24 + orr r4, r4, r5, lspush #24 str r4, [dst], #4 adcs sum, sum, r4 - mov r4, r5, pull #8 + mov r4, r5, lspull #8 4: ands len, len, #3 - beq .done + beq .Ldone mov r5, r4, get_byte_0 tst len, #2 - beq .exit - adcs sum, sum, r4, push #16 + beq .Lexit + adcs sum, sum, r4, lspush #16 strb r5, [dst], #1 mov r5, r4, get_byte_1 strb r5, [dst], #1 mov r5, r4, get_byte_2 - b .exit + b .Lexit -.src2_aligned: mov r4, r5, pull #16 +.Lsrc2_aligned: mov r4, r5, lspull #16 adds sum, sum, #0 bics ip, len, #15 beq 2f 1: load4l r5, r6, r7, r8 - orr r4, r4, r5, push #16 - mov r5, r5, pull #16 - orr r5, r5, r6, push #16 - mov r6, r6, pull #16 - orr r6, r6, r7, push #16 - mov r7, r7, pull #16 - orr r7, r7, r8, push #16 + orr r4, r4, r5, lspush #16 + mov r5, r5, lspull #16 + orr r5, r5, r6, lspush #16 + mov r6, r6, lspull #16 + orr r6, r6, r7, lspush #16 + mov r7, r7, lspull #16 + orr r7, r7, r8, lspush #16 stmia dst!, {r4, r5, r6, r7} adcs sum, sum, r4 adcs sum, sum, r5 adcs sum, sum, r6 adcs sum, sum, r7 - mov r4, r8, pull #16 + mov r4, r8, lspull #16 sub ip, ip, #16 teq ip, #0 bne 1b @@ -248,52 +248,52 @@ FN_ENTRY tst ip, #8 beq 3f load2l r5, r6 - orr r4, r4, r5, push #16 - mov r5, r5, pull #16 - orr r5, r5, r6, push #16 + orr r4, r4, r5, 
lspush #16 + mov r5, r5, lspull #16 + orr r5, r5, r6, lspush #16 stmia dst!, {r4, r5} adcs sum, sum, r4 adcs sum, sum, r5 - mov r4, r6, pull #16 + mov r4, r6, lspull #16 tst ip, #4 beq 4f 3: load1l r5 - orr r4, r4, r5, push #16 + orr r4, r4, r5, lspush #16 str r4, [dst], #4 adcs sum, sum, r4 - mov r4, r5, pull #16 + mov r4, r5, lspull #16 4: ands len, len, #3 - beq .done + beq .Ldone mov r5, r4, get_byte_0 tst len, #2 - beq .exit + beq .Lexit adcs sum, sum, r4 strb r5, [dst], #1 mov r5, r4, get_byte_1 strb r5, [dst], #1 tst len, #1 - beq .done + beq .Ldone load1b r5 - b .exit + b .Lexit -.src3_aligned: mov r4, r5, pull #24 +.Lsrc3_aligned: mov r4, r5, lspull #24 adds sum, sum, #0 bics ip, len, #15 beq 2f 1: load4l r5, r6, r7, r8 - orr r4, r4, r5, push #8 - mov r5, r5, pull #24 - orr r5, r5, r6, push #8 - mov r6, r6, pull #24 - orr r6, r6, r7, push #8 - mov r7, r7, pull #24 - orr r7, r7, r8, push #8 + orr r4, r4, r5, lspush #8 + mov r5, r5, lspull #24 + orr r5, r5, r6, lspush #8 + mov r6, r6, lspull #24 + orr r6, r6, r7, lspush #8 + mov r7, r7, lspull #24 + orr r7, r7, r8, lspush #8 stmia dst!, {r4, r5, r6, r7} adcs sum, sum, r4 adcs sum, sum, r5 adcs sum, sum, r6 adcs sum, sum, r7 - mov r4, r8, pull #24 + mov r4, r8, lspull #24 sub ip, ip, #16 teq ip, #0 bne 1b @@ -302,30 +302,31 @@ FN_ENTRY tst ip, #8 beq 3f load2l r5, r6 - orr r4, r4, r5, push #8 - mov r5, r5, pull #24 - orr r5, r5, r6, push #8 + orr r4, r4, r5, lspush #8 + mov r5, r5, lspull #24 + orr r5, r5, r6, lspush #8 stmia dst!, {r4, r5} adcs sum, sum, r4 adcs sum, sum, r5 - mov r4, r6, pull #24 + mov r4, r6, lspull #24 tst ip, #4 beq 4f 3: load1l r5 - orr r4, r4, r5, push #8 + orr r4, r4, r5, lspush #8 str r4, [dst], #4 adcs sum, sum, r4 - mov r4, r5, pull #24 + mov r4, r5, lspull #24 4: ands len, len, #3 - beq .done + beq .Ldone mov r5, r4, get_byte_0 tst len, #2 - beq .exit + beq .Lexit strb r5, [dst], #1 adcs sum, sum, r4 load1l r4 mov r5, r4, get_byte_0 strb r5, [dst], #1 - adcs sum, sum, r4, push #24 + adcs sum, sum, r4, lspush #24 mov r5, r4, get_byte_1 - b .exit + b .Lexit +FN_EXIT diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S index 333bca292de..7d08b43d2c0 100644 --- a/arch/arm/lib/csumpartialcopyuser.S +++ b/arch/arm/lib/csumpartialcopyuser.S @@ -18,58 +18,36 @@ .text .macro save_regs - stmfd sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc} + stmfd sp!, {r1, r2, r4 - r8, lr} .endm - .macro load_regs,flags - ldm\flags fp, {r1, r2, r4-r8, fp, sp, pc} + .macro load_regs + ldmfd sp!, {r1, r2, r4 - r8, pc} .endm .macro load1b, reg1 -9999: ldrbt \reg1, [r0], $1 - .section __ex_table, "a" - .align 3 - .long 9999b, 6001f - .previous + ldrusr \reg1, r0, 1 .endm .macro load2b, reg1, reg2 -9999: ldrbt \reg1, [r0], $1 -9998: ldrbt \reg2, [r0], $1 - .section __ex_table, "a" - .long 9999b, 6001f - .long 9998b, 6001f - .previous + ldrusr \reg1, r0, 1 + ldrusr \reg2, r0, 1 .endm .macro load1l, reg1 -9999: ldrt \reg1, [r0], $4 - .section __ex_table, "a" - .align 3 - .long 9999b, 6001f - .previous + ldrusr \reg1, r0, 4 .endm .macro load2l, reg1, reg2 -9999: ldrt \reg1, [r0], $4 -9998: ldrt \reg2, [r0], $4 - .section __ex_table, "a" - .long 9999b, 6001f - .long 9998b, 6001f - .previous + ldrusr \reg1, r0, 4 + ldrusr \reg2, r0, 4 .endm .macro load4l, reg1, reg2, reg3, reg4 -9999: ldrt \reg1, [r0], $4 -9998: ldrt \reg2, [r0], $4 -9997: ldrt \reg3, [r0], $4 -9996: ldrt \reg4, [r0], $4 - .section __ex_table, "a" - .long 9999b, 6001f - .long 9998b, 6001f - .long 9997b, 6001f - .long 9996b, 6001f - .previous + 
ldrusr \reg1, r0, 4 + ldrusr \reg2, r0, 4 + ldrusr \reg3, r0, 4 + ldrusr \reg4, r0, 4 .endm /* @@ -80,6 +58,7 @@ */ #define FN_ENTRY ENTRY(csum_partial_copy_from_user) +#define FN_EXIT ENDPROC(csum_partial_copy_from_user) #include "csumpartialcopygeneric.S" @@ -89,16 +68,16 @@ * so properly, we would have to add in whatever registers were loaded before * the fault, which, with the current asm above is not predictable. */ - .section .fixup,"ax" + .pushsection .fixup,"ax" .align 4 -6001: mov r4, #-EFAULT - ldr r5, [fp, #4] @ *err_ptr +9001: mov r4, #-EFAULT + ldr r5, [sp, #8*4] @ *err_ptr str r4, [r5] ldmia sp, {r1, r2} @ retrieve dst, len add r2, r2, r1 mov r0, #0 @ zero the buffer -6002: teq r2, r1 +9002: teq r2, r1 strneb r0, [r1], #1 - bne 6002b - load_regs ea - .previous + bne 9002b + load_regs + .popsection diff --git a/arch/arm/lib/delay-loop.S b/arch/arm/lib/delay-loop.S new file mode 100644 index 00000000000..bc1033b897b --- /dev/null +++ b/arch/arm/lib/delay-loop.S @@ -0,0 +1,68 @@ +/* + * linux/arch/arm/lib/delay.S + * + * Copyright (C) 1995, 1996 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/delay.h> + .text + +.LC0: .word loops_per_jiffy +.LC1: .word UDELAY_MULT + +/* + * r0 <= 2000 + * lpj <= 0x01ffffff (max. 3355 bogomips) + * HZ <= 1000 + */ + +ENTRY(__loop_udelay) + ldr r2, .LC1 + mul r0, r2, r0 +ENTRY(__loop_const_udelay) @ 0 <= r0 <= 0x7fffff06 + mov r1, #-1 + ldr r2, .LC0 + ldr r2, [r2] @ max = 0x01ffffff + add r0, r0, r1, lsr #32-14 + mov r0, r0, lsr #14 @ max = 0x0001ffff + add r2, r2, r1, lsr #32-10 + mov r2, r2, lsr #10 @ max = 0x00007fff + mul r0, r2, r0 @ max = 2^32-1 + add r0, r0, r1, lsr #32-6 + movs r0, r0, lsr #6 + moveq pc, lr + +/* + * loops = r0 * HZ * loops_per_jiffy / 1000000 + */ + .align 3 + +@ Delay routine +ENTRY(__loop_delay) + subs r0, r0, #1 +#if 0 + movls pc, lr + subs r0, r0, #1 + movls pc, lr + subs r0, r0, #1 + movls pc, lr + subs r0, r0, #1 + movls pc, lr + subs r0, r0, #1 + movls pc, lr + subs r0, r0, #1 + movls pc, lr + subs r0, r0, #1 + movls pc, lr + subs r0, r0, #1 +#endif + bhi __loop_delay + mov pc, lr +ENDPROC(__loop_udelay) +ENDPROC(__loop_const_udelay) +ENDPROC(__loop_delay) diff --git a/arch/arm/lib/delay.S b/arch/arm/lib/delay.S deleted file mode 100644 index 3c7f7e675dd..00000000000 --- a/arch/arm/lib/delay.S +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/arch/arm/lib/delay.S - * - * Copyright (C) 1995, 1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/linkage.h> -#include <asm/assembler.h> - .text - -LC0: .word loops_per_jiffy - -/* - * 0 <= r0 <= 2000 - */ -ENTRY(__udelay) - mov r2, #0x6800 - orr r2, r2, #0x00db - mul r0, r2, r0 -ENTRY(__const_udelay) @ 0 <= r0 <= 0x01ffffff - ldr r2, LC0 - ldr r2, [r2] @ max = 0x0fffffff - mov r0, r0, lsr #11 @ max = 0x00003fff - mov r2, r2, lsr #11 @ max = 0x0003ffff - mul r0, r2, r0 @ max = 2^32-1 - movs r0, r0, lsr #6 - RETINSTR(moveq,pc,lr) - -/* - * loops = (r0 * 0x10c6 * 100 * loops_per_jiffy) / 2^32 - * - * Oh, if only we had a cycle counter... 
- */ - -@ Delay routine -ENTRY(__delay) - subs r0, r0, #1 -#if 0 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 - RETINSTR(movls,pc,lr) - subs r0, r0, #1 -#endif - bhi __delay - RETINSTR(mov,pc,lr) diff --git a/arch/arm/lib/delay.c b/arch/arm/lib/delay.c new file mode 100644 index 00000000000..5306de35013 --- /dev/null +++ b/arch/arm/lib/delay.c @@ -0,0 +1,93 @@ +/* + * Delay loops based on the OpenRISC implementation. + * + * Copyright (C) 2012 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Will Deacon <will.deacon@arm.com> + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timex.h> + +/* + * Default to the loop-based delay implementation. + */ +struct arm_delay_ops arm_delay_ops = { + .delay = __loop_delay, + .const_udelay = __loop_const_udelay, + .udelay = __loop_udelay, +}; + +static const struct delay_timer *delay_timer; +static bool delay_calibrated; + +int read_current_timer(unsigned long *timer_val) +{ + if (!delay_timer) + return -ENXIO; + + *timer_val = delay_timer->read_current_timer(); + return 0; +} +EXPORT_SYMBOL_GPL(read_current_timer); + +static void __timer_delay(unsigned long cycles) +{ + cycles_t start = get_cycles(); + + while ((get_cycles() - start) < cycles) + cpu_relax(); +} + +static void __timer_const_udelay(unsigned long xloops) +{ + unsigned long long loops = xloops; + loops *= arm_delay_ops.ticks_per_jiffy; + __timer_delay(loops >> UDELAY_SHIFT); +} + +static void __timer_udelay(unsigned long usecs) +{ + __timer_const_udelay(usecs * UDELAY_MULT); +} + +void __init register_current_timer_delay(const struct delay_timer *timer) +{ + if (!delay_calibrated) { + pr_info("Switching to timer-based delay loop\n"); + delay_timer = timer; + lpj_fine = timer->freq / HZ; + + /* cpufreq may scale loops_per_jiffy, so keep a private copy */ + arm_delay_ops.ticks_per_jiffy = lpj_fine; + arm_delay_ops.delay = __timer_delay; + arm_delay_ops.const_udelay = __timer_const_udelay; + arm_delay_ops.udelay = __timer_udelay; + + delay_calibrated = true; + } else { + pr_info("Ignoring duplicate/late registration of read_current_timer delay\n"); + } +} + +unsigned long calibrate_delay_is_known(void) +{ + delay_calibrated = true; + return lpj_fine; +} diff --git a/arch/arm/lib/div64.S b/arch/arm/lib/div64.S index ec9a1cd6176..e55c4842c29 100644 --- a/arch/arm/lib/div64.S +++ b/arch/arm/lib/div64.S @@ -13,6 +13,7 @@ */ #include <linux/linkage.h> +#include <asm/unwind.h> #ifdef __ARMEB__ #define xh r0 @@ -44,6 +45,7 @@ */ ENTRY(__do_div64) +UNWIND(.fnstart) @ Test for easy paths first. 
subs ip, r4, #1 @@ -177,7 +179,9 @@ ENTRY(__do_div64) mov yh, xh, lsr ip mov yl, xl, lsr ip rsb ip, ip, #32 - orr yl, yl, xh, lsl ip + ARM( orr yl, yl, xh, lsl ip ) + THUMB( lsl xh, xh, ip ) + THUMB( orr yl, yl, xh ) mov xh, xl, lsl ip mov xh, xh, lsr ip mov pc, lr @@ -187,14 +191,21 @@ ENTRY(__do_div64) moveq yh, xh moveq xh, #0 moveq pc, lr +UNWIND(.fnend) +UNWIND(.fnstart) +UNWIND(.pad #4) +UNWIND(.save {lr}) +Ldiv0_64: @ Division by 0: - str lr, [sp, #-4]! + str lr, [sp, #-8]! bl __div0 @ as wrong as it could be... mov yl, #0 mov yh, #0 mov xh, #0 - ldr pc, [sp], #4 + ldr pc, [sp], #8 +UNWIND(.fnend) +ENDPROC(__do_div64) diff --git a/arch/arm/lib/ecard.S b/arch/arm/lib/ecard.S index fb7b602a6f7..e6057fa851b 100644 --- a/arch/arm/lib/ecard.S +++ b/arch/arm/lib/ecard.S @@ -12,7 +12,6 @@ */ #include <linux/linkage.h> #include <asm/assembler.h> -#include <asm/hardware.h> #define CPSR2SPSR(rt) \ mrs rt, cpsr; \ @@ -29,7 +28,7 @@ ENTRY(ecard_loader_read) CPSR2SPSR(r0) mov lr, pc mov pc, r2 - LOADREGS(fd, sp!, {r4 - r12, pc}) + ldmfd sp!, {r4 - r12, pc} @ Purpose: call an expansion card loader to reset the card @ Proto : void read_loader(int card_base, char *loader); @@ -41,5 +40,5 @@ ENTRY(ecard_loader_reset) CPSR2SPSR(r0) mov lr, pc add pc, r1, #8 - LOADREGS(fd, sp!, {r4 - r12, pc}) + ldmfd sp!, {r4 - r12, pc} diff --git a/arch/arm/lib/findbit.S b/arch/arm/lib/findbit.S index f055d56ea68..64f6bc1a913 100644 --- a/arch/arm/lib/findbit.S +++ b/arch/arm/lib/findbit.S @@ -25,14 +25,18 @@ ENTRY(_find_first_zero_bit_le) teq r1, #0 beq 3f mov r2, #0 -1: ldrb r3, [r0, r2, lsr #3] +1: + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) eors r3, r3, #0xff @ invert bits - bne .found @ any now set - found zero bit + bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? blo 1b 3: mov r0, r1 @ no free bits - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(_find_first_zero_bit_le) /* * Purpose : Find next 'zero' bit @@ -43,13 +47,16 @@ ENTRY(_find_next_zero_bit_le) beq 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine - ldrb r3, [r0, r2, lsr #3] + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) eor r3, r3, #0xff @ now looking for a 1 bit movs r3, r3, lsr ip @ shift off unused bits - bne .found + bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit +ENDPROC(_find_next_zero_bit_le) /* * Purpose : Find a 'one' bit @@ -59,14 +66,18 @@ ENTRY(_find_first_bit_le) teq r1, #0 beq 3f mov r2, #0 -1: ldrb r3, [r0, r2, lsr #3] +1: + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) movs r3, r3 - bne .found @ any now set - found zero bit + bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? 
blo 1b 3: mov r0, r1 @ no free bits - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(_find_first_bit_le) /* * Purpose : Find next 'one' bit @@ -77,12 +88,15 @@ ENTRY(_find_next_bit_le) beq 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine - ldrb r3, [r0, r2, lsr #3] + ARM( ldrb r3, [r0, r2, lsr #3] ) + THUMB( lsr r3, r2, #3 ) + THUMB( ldrb r3, [r0, r3] ) movs r3, r3, lsr ip @ shift off unused bits - bne .found + bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit +ENDPROC(_find_next_bit_le) #ifdef __ARMEB__ @@ -91,14 +105,17 @@ ENTRY(_find_first_zero_bit_be) beq 3f mov r2, #0 1: eor r3, r2, #0x18 @ big endian byte ordering - ldrb r3, [r0, r3, lsr #3] + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) eors r3, r3, #0xff @ invert bits - bne .found @ any now set - found zero bit + bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? blo 1b 3: mov r0, r1 @ no free bits - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(_find_first_zero_bit_be) ENTRY(_find_next_zero_bit_be) teq r1, #0 @@ -106,27 +123,33 @@ ENTRY(_find_next_zero_bit_be) ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering - ldrb r3, [r0, r3, lsr #3] + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) eor r3, r3, #0xff @ now looking for a 1 bit movs r3, r3, lsr ip @ shift off unused bits - bne .found + bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit +ENDPROC(_find_next_zero_bit_be) ENTRY(_find_first_bit_be) teq r1, #0 beq 3f mov r2, #0 1: eor r3, r2, #0x18 @ big endian byte ordering - ldrb r3, [r0, r3, lsr #3] + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) movs r3, r3 - bne .found @ any now set - found zero bit + bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? blo 1b 3: mov r0, r1 @ no free bits - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(_find_first_bit_be) ENTRY(_find_next_bit_be) teq r1, #0 @@ -134,22 +157,25 @@ ENTRY(_find_next_bit_be) ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering - ldrb r3, [r0, r3, lsr #3] + ARM( ldrb r3, [r0, r3, lsr #3] ) + THUMB( lsr r3, #3 ) + THUMB( ldrb r3, [r0, r3] ) movs r3, r3, lsr ip @ shift off unused bits - bne .found + bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit +ENDPROC(_find_next_bit_be) #endif /* * One or more bits in the LSB of r3 are assumed to be set. */ -.found: +.L_found: #if __LINUX_ARM_ARCH__ >= 5 - rsb r1, r3, #0 - and r3, r3, r1 + rsb r0, r3, #0 + and r3, r3, r0 clz r3, r3 rsb r3, r3, #31 add r0, r2, r3 @@ -164,5 +190,7 @@ ENTRY(_find_next_bit_be) addeq r2, r2, #1 mov r0, r2 #endif - RETINSTR(mov,pc,lr) + cmp r1, r0 @ Clamp to maxbit + movlo r0, r1 + mov pc, lr diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S index d204018070a..9b06bb41fca 100644 --- a/arch/arm/lib/getuser.S +++ b/arch/arm/lib/getuser.S @@ -16,63 +16,65 @@ * __get_user_X * * Inputs: r0 contains the address + * r1 contains the address limit, which must be preserved * Outputs: r0 is the error code - * r2, r3 contains the zero-extended value + * r2 contains the zero-extended value * lr corrupted * - * No other registers must be altered. 
(see include/asm-arm/uaccess.h + * No other registers must be altered. (see <asm/uaccess.h> * for specific ASM register usage). * * Note that ADDR_LIMIT is either 0 or 0xc0000000. * Note also that it is intended that __get_user_bad is not global. */ -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> +#include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/errno.h> +#include <asm/domain.h> - .global __get_user_1 -__get_user_1: -1: ldrbt r2, [r0] +ENTRY(__get_user_1) + check_uaccess r0, 1, r1, r2, __get_user_bad +1: TUSER(ldrb) r2, [r0] mov r0, #0 mov pc, lr +ENDPROC(__get_user_1) - .global __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + check_uaccess r0, 2, r1, r2, __get_user_bad +#ifdef CONFIG_CPU_USE_DOMAINS +rb .req ip 2: ldrbt r2, [r0], #1 -3: ldrbt r3, [r0] +3: ldrbt rb, [r0], #0 +#else +rb .req r0 +2: ldrb r2, [r0] +3: ldrb rb, [r0, #1] +#endif #ifndef __ARMEB__ - orr r2, r2, r3, lsl #8 + orr r2, r2, rb, lsl #8 #else - orr r2, r3, r2, lsl #8 + orr r2, rb, r2, lsl #8 #endif mov r0, #0 mov pc, lr +ENDPROC(__get_user_2) - .global __get_user_4 -__get_user_4: -4: ldrt r2, [r0] - mov r0, #0 - mov pc, lr - - .global __get_user_8 -__get_user_8: -5: ldrt r2, [r0], #4 -6: ldrt r3, [r0] +ENTRY(__get_user_4) + check_uaccess r0, 4, r1, r2, __get_user_bad +4: TUSER(ldr) r2, [r0] mov r0, #0 mov pc, lr +ENDPROC(__get_user_4) -__get_user_bad_8: - mov r3, #0 __get_user_bad: mov r2, #0 mov r0, #-EFAULT mov pc, lr +ENDPROC(__get_user_bad) -.section __ex_table, "a" +.pushsection __ex_table, "a" .long 1b, __get_user_bad .long 2b, __get_user_bad .long 3b, __get_user_bad .long 4b, __get_user_bad - .long 5b, __get_user_bad_8 - .long 6b, __get_user_bad_8 -.previous +.popsection diff --git a/arch/arm/lib/io-acorn.S b/arch/arm/lib/io-acorn.S index 3aacd01d40e..69719bad674 100644 --- a/arch/arm/lib/io-acorn.S +++ b/arch/arm/lib/io-acorn.S @@ -11,14 +11,14 @@ * */ #include <linux/linkage.h> +#include <linux/kern_levels.h> #include <asm/assembler.h> -#include <asm/hardware.h> .text .align -.iosl_warning: - .ascii "<4>insl/outsl not implemented, called from %08lX\0" +.Liosl_warning: + .ascii KERN_WARNING "insl/outsl not implemented, called from %08lX\0" .align /* @@ -27,6 +27,6 @@ */ ENTRY(insl) ENTRY(outsl) - adr r0, .iosl_warning + adr r0, .Liosl_warning mov r1, lr b printk diff --git a/arch/arm/lib/io-readsb.S b/arch/arm/lib/io-readsb.S index 081ef749298..9f4238987fe 100644 --- a/arch/arm/lib/io-readsb.S +++ b/arch/arm/lib/io-readsb.S @@ -10,7 +10,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> -.insb_align: rsb ip, ip, #4 +.Linsb_align: rsb ip, ip, #4 cmp ip, r2 movgt ip, r2 cmp ip, #2 @@ -21,20 +21,20 @@ ldrgtb r3, [r0] strgtb r3, [r1], #1 subs r2, r2, ip - bne .insb_aligned + bne .Linsb_aligned ENTRY(__raw_readsb) teq r2, #0 @ do we have to check for the zero len? 
moveq pc, lr ands ip, r1, #3 - bne .insb_align + bne .Linsb_align -.insb_aligned: stmfd sp!, {r4 - r6, lr} +.Linsb_aligned: stmfd sp!, {r4 - r6, lr} subs r2, r2, #16 - bmi .insb_no_16 + bmi .Linsb_no_16 -.insb_16_lp: ldrb r3, [r0] +.Linsb_16_lp: ldrb r3, [r0] ldrb r4, [r0] ldrb r5, [r0] mov r3, r3, put_byte_0 @@ -69,13 +69,13 @@ ENTRY(__raw_readsb) stmia r1!, {r3 - r6} subs r2, r2, #16 - bpl .insb_16_lp + bpl .Linsb_16_lp tst r2, #15 - LOADREGS(eqfd, sp!, {r4 - r6, pc}) + ldmeqfd sp!, {r4 - r6, pc} -.insb_no_16: tst r2, #8 - beq .insb_no_8 +.Linsb_no_16: tst r2, #8 + beq .Linsb_no_8 ldrb r3, [r0] ldrb r4, [r0] @@ -95,8 +95,8 @@ ENTRY(__raw_readsb) orr r4, r4, ip, put_byte_3 stmia r1!, {r3, r4} -.insb_no_8: tst r2, #4 - beq .insb_no_4 +.Linsb_no_8: tst r2, #4 + beq .Linsb_no_4 ldrb r3, [r0] ldrb r4, [r0] @@ -108,8 +108,8 @@ ENTRY(__raw_readsb) orr r3, r3, r6, put_byte_3 str r3, [r1], #4 -.insb_no_4: ands r2, r2, #3 - LOADREGS(eqfd, sp!, {r4 - r6, pc}) +.Linsb_no_4: ands r2, r2, #3 + ldmeqfd sp!, {r4 - r6, pc} cmp r2, #2 ldrb r3, [r0] @@ -119,4 +119,5 @@ ENTRY(__raw_readsb) ldrgtb r3, [r0] strgtb r3, [r1] - LOADREGS(fd, sp!, {r4 - r6, pc}) + ldmfd sp!, {r4 - r6, pc} +ENDPROC(__raw_readsb) diff --git a/arch/arm/lib/io-readsl.S b/arch/arm/lib/io-readsl.S index 75a9121cb23..7a7430950c7 100644 --- a/arch/arm/lib/io-readsl.S +++ b/arch/arm/lib/io-readsl.S @@ -47,25 +47,25 @@ ENTRY(__raw_readsl) strb ip, [r1], #1 4: subs r2, r2, #1 - mov ip, r3, pull #24 + mov ip, r3, lspull #24 ldrne r3, [r0] - orrne ip, ip, r3, push #8 + orrne ip, ip, r3, lspush #8 strne ip, [r1], #4 bne 4b b 8f 5: subs r2, r2, #1 - mov ip, r3, pull #16 + mov ip, r3, lspull #16 ldrne r3, [r0] - orrne ip, ip, r3, push #16 + orrne ip, ip, r3, lspush #16 strne ip, [r1], #4 bne 5b b 7f 6: subs r2, r2, #1 - mov ip, r3, pull #8 + mov ip, r3, lspull #8 ldrne r3, [r0] - orrne ip, ip, r3, push #24 + orrne ip, ip, r3, lspush #24 strne ip, [r1], #4 bne 6b @@ -76,3 +76,4 @@ ENTRY(__raw_readsl) 8: mov r3, ip, get_byte_0 strb r3, [r1, #0] mov pc, lr +ENDPROC(__raw_readsl) diff --git a/arch/arm/lib/io-readsw-armv3.S b/arch/arm/lib/io-readsw-armv3.S index 476cf7f8a63..88487c8c4f2 100644 --- a/arch/arm/lib/io-readsw-armv3.S +++ b/arch/arm/lib/io-readsw-armv3.S @@ -9,18 +9,17 @@ */ #include <linux/linkage.h> #include <asm/assembler.h> -#include <asm/hardware.h> -.insw_bad_alignment: - adr r0, .insw_bad_align_msg +.Linsw_bad_alignment: + adr r0, .Linsw_bad_align_msg mov r2, lr b panic -.insw_bad_align_msg: +.Linsw_bad_align_msg: .asciz "insw: bad buffer alignment (0x%p, lr=0x%08lX)\n" .align -.insw_align: tst r1, #1 - bne .insw_bad_alignment +.Linsw_align: tst r1, #1 + bne .Linsw_bad_alignment ldr r3, [r0] strb r3, [r1], #1 @@ -28,22 +27,22 @@ strb r3, [r1], #1 subs r2, r2, #1 - RETINSTR(moveq, pc, lr) + moveq pc, lr ENTRY(__raw_readsw) teq r2, #0 @ do we have to check for the zero len? 
moveq pc, lr tst r1, #3 - bne .insw_align + bne .Linsw_align -.insw_aligned: mov ip, #0xff +.Linsw_aligned: mov ip, #0xff orr ip, ip, ip, lsl #8 stmfd sp!, {r4, r5, r6, lr} subs r2, r2, #8 - bmi .no_insw_8 + bmi .Lno_insw_8 -.insw_8_lp: ldr r3, [r0] +.Linsw_8_lp: ldr r3, [r0] and r3, r3, ip ldr r4, [r0] orr r3, r3, r4, lsl #16 @@ -66,13 +65,13 @@ ENTRY(__raw_readsw) stmia r1!, {r3 - r6} subs r2, r2, #8 - bpl .insw_8_lp + bpl .Linsw_8_lp tst r2, #7 - LOADREGS(eqfd, sp!, {r4, r5, r6, pc}) + ldmeqfd sp!, {r4, r5, r6, pc} -.no_insw_8: tst r2, #4 - beq .no_insw_4 +.Lno_insw_8: tst r2, #4 + beq .Lno_insw_4 ldr r3, [r0] and r3, r3, ip @@ -86,8 +85,8 @@ ENTRY(__raw_readsw) stmia r1!, {r3, r4} -.no_insw_4: tst r2, #2 - beq .no_insw_2 +.Lno_insw_4: tst r2, #2 + beq .Lno_insw_2 ldr r3, [r0] and r3, r3, ip @@ -96,12 +95,12 @@ ENTRY(__raw_readsw) str r3, [r1], #4 -.no_insw_2: tst r2, #1 +.Lno_insw_2: tst r2, #1 ldrne r3, [r0] strneb r3, [r1], #1 movne r3, r3, lsr #8 strneb r3, [r1] - LOADREGS(fd, sp!, {r4, r5, r6, pc}) + ldmfd sp!, {r4, r5, r6, pc} diff --git a/arch/arm/lib/io-readsw-armv4.S b/arch/arm/lib/io-readsw-armv4.S index c92b66ecbe8..1f393d42593 100644 --- a/arch/arm/lib/io-readsw-armv4.S +++ b/arch/arm/lib/io-readsw-armv4.S @@ -18,8 +18,8 @@ #endif .endm -.insw_align: movs ip, r1, lsl #31 - bne .insw_noalign +.Linsw_align: movs ip, r1, lsl #31 + bne .Linsw_noalign ldrh ip, [r0] sub r2, r2, #1 strh ip, [r1], #2 @@ -28,14 +28,14 @@ ENTRY(__raw_readsw) teq r2, #0 moveq pc, lr tst r1, #3 - bne .insw_align + bne .Linsw_align stmfd sp!, {r4, r5, lr} subs r2, r2, #8 - bmi .no_insw_8 + bmi .Lno_insw_8 -.insw_8_lp: ldrh r3, [r0] +.Linsw_8_lp: ldrh r3, [r0] ldrh r4, [r0] pack r3, r3, r4 @@ -53,10 +53,10 @@ ENTRY(__raw_readsw) subs r2, r2, #8 stmia r1!, {r3 - r5, ip} - bpl .insw_8_lp + bpl .Linsw_8_lp -.no_insw_8: tst r2, #4 - beq .no_insw_4 +.Lno_insw_8: tst r2, #4 + beq .Lno_insw_4 ldrh r3, [r0] ldrh r4, [r0] @@ -68,15 +68,15 @@ ENTRY(__raw_readsw) stmia r1!, {r3, r4} -.no_insw_4: movs r2, r2, lsl #31 - bcc .no_insw_2 +.Lno_insw_4: movs r2, r2, lsl #31 + bcc .Lno_insw_2 ldrh r3, [r0] ldrh ip, [r0] pack r3, r3, ip str r3, [r1], #4 -.no_insw_2: ldrneh r3, [r0] +.Lno_insw_2: ldrneh r3, [r0] strneh r3, [r1] ldmfd sp!, {r4, r5, pc} @@ -93,7 +93,7 @@ ENTRY(__raw_readsw) #define pull_hbyte1 lsr #8 #endif -.insw_noalign: stmfd sp!, {r4, lr} +.Linsw_noalign: stmfd sp!, {r4, lr} ldrccb ip, [r1, #-1]! bcc 1f @@ -128,3 +128,4 @@ ENTRY(__raw_readsw) _BE_ONLY_( movne ip, ip, lsr #24 ) strneb ip, [r1] ldmfd sp!, {r4, pc} +ENDPROC(__raw_readsw) diff --git a/arch/arm/lib/io-shark.c b/arch/arm/lib/io-shark.c deleted file mode 100644 index 824253948f5..00000000000 --- a/arch/arm/lib/io-shark.c +++ /dev/null @@ -1,13 +0,0 @@ -/* - * linux/arch/arm/lib/io-shark.c - * - * by Alexander Schulz - * - * derived from: - * linux/arch/arm/lib/io-ebsa.S - * Copyright (C) 1995, 1996 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ diff --git a/arch/arm/lib/io-writesb.S b/arch/arm/lib/io-writesb.S index 70b2561bdb0..68b92f4acae 100644 --- a/arch/arm/lib/io-writesb.S +++ b/arch/arm/lib/io-writesb.S @@ -30,7 +30,7 @@ #endif .endm -.outsb_align: rsb ip, ip, #4 +.Loutsb_align: rsb ip, ip, #4 cmp ip, r2 movgt ip, r2 cmp ip, #2 @@ -41,45 +41,46 @@ ldrgtb r3, [r1], #1 strgtb r3, [r0] subs r2, r2, ip - bne .outsb_aligned + bne .Loutsb_aligned ENTRY(__raw_writesb) teq r2, #0 @ do we have to check for the zero len? moveq pc, lr ands ip, r1, #3 - bne .outsb_align + bne .Loutsb_align -.outsb_aligned: stmfd sp!, {r4, r5, lr} +.Loutsb_aligned: + stmfd sp!, {r4, r5, lr} subs r2, r2, #16 - bmi .outsb_no_16 + bmi .Loutsb_no_16 -.outsb_16_lp: ldmia r1!, {r3, r4, r5, ip} +.Loutsb_16_lp: ldmia r1!, {r3, r4, r5, ip} outword r3 outword r4 outword r5 outword ip subs r2, r2, #16 - bpl .outsb_16_lp + bpl .Loutsb_16_lp tst r2, #15 - LOADREGS(eqfd, sp!, {r4, r5, pc}) + ldmeqfd sp!, {r4, r5, pc} -.outsb_no_16: tst r2, #8 - beq .outsb_no_8 +.Loutsb_no_16: tst r2, #8 + beq .Loutsb_no_8 ldmia r1!, {r3, r4} outword r3 outword r4 -.outsb_no_8: tst r2, #4 - beq .outsb_no_4 +.Loutsb_no_8: tst r2, #4 + beq .Loutsb_no_4 ldr r3, [r1], #4 outword r3 -.outsb_no_4: ands r2, r2, #3 - LOADREGS(eqfd, sp!, {r4, r5, pc}) +.Loutsb_no_4: ands r2, r2, #3 + ldmeqfd sp!, {r4, r5, pc} cmp r2, #2 ldrb r3, [r1], #1 @@ -89,4 +90,5 @@ ENTRY(__raw_writesb) ldrgtb r3, [r1] strgtb r3, [r0] - LOADREGS(fd, sp!, {r4, r5, pc}) + ldmfd sp!, {r4, r5, pc} +ENDPROC(__raw_writesb) diff --git a/arch/arm/lib/io-writesl.S b/arch/arm/lib/io-writesl.S index f8f14dd227c..d0d104a0dd1 100644 --- a/arch/arm/lib/io-writesl.S +++ b/arch/arm/lib/io-writesl.S @@ -41,26 +41,27 @@ ENTRY(__raw_writesl) blt 5f bgt 6f -4: mov ip, r3, pull #16 +4: mov ip, r3, lspull #16 ldr r3, [r1], #4 subs r2, r2, #1 - orr ip, ip, r3, push #16 + orr ip, ip, r3, lspush #16 str ip, [r0] bne 4b mov pc, lr -5: mov ip, r3, pull #8 +5: mov ip, r3, lspull #8 ldr r3, [r1], #4 subs r2, r2, #1 - orr ip, ip, r3, push #24 + orr ip, ip, r3, lspush #24 str ip, [r0] bne 5b mov pc, lr -6: mov ip, r3, pull #24 +6: mov ip, r3, lspull #24 ldr r3, [r1], #4 subs r2, r2, #1 - orr ip, ip, r3, push #8 + orr ip, ip, r3, lspush #8 str ip, [r0] bne 6b mov pc, lr +ENDPROC(__raw_writesl) diff --git a/arch/arm/lib/io-writesw-armv3.S b/arch/arm/lib/io-writesw-armv3.S index 950e7e310f1..49b800419e3 100644 --- a/arch/arm/lib/io-writesw-armv3.S +++ b/arch/arm/lib/io-writesw-armv3.S @@ -9,18 +9,17 @@ */ #include <linux/linkage.h> #include <asm/assembler.h> -#include <asm/hardware.h> -.outsw_bad_alignment: - adr r0, .outsw_bad_align_msg +.Loutsw_bad_alignment: + adr r0, .Loutsw_bad_align_msg mov r2, lr b panic -.outsw_bad_align_msg: +.Loutsw_bad_align_msg: .asciz "outsw: bad buffer alignment (0x%p, lr=0x%08lX)\n" .align -.outsw_align: tst r1, #1 - bne .outsw_bad_alignment +.Loutsw_align: tst r1, #1 + bne .Loutsw_bad_alignment add r1, r1, #2 @@ -29,20 +28,20 @@ orr r3, r3, r3, lsl #16 str r3, [r0] subs r2, r2, #1 - RETINSTR(moveq, pc, lr) + moveq pc, lr ENTRY(__raw_writesw) teq r2, #0 @ do we have to check for the zero len? 
moveq pc, lr tst r1, #3 - bne .outsw_align + bne .Loutsw_align -.outsw_aligned: stmfd sp!, {r4, r5, r6, lr} + stmfd sp!, {r4, r5, r6, lr} subs r2, r2, #8 - bmi .no_outsw_8 + bmi .Lno_outsw_8 -.outsw_8_lp: ldmia r1!, {r3, r4, r5, r6} +.Loutsw_8_lp: ldmia r1!, {r3, r4, r5, r6} mov ip, r3, lsl #16 orr ip, ip, ip, lsr #16 @@ -77,13 +76,13 @@ ENTRY(__raw_writesw) str ip, [r0] subs r2, r2, #8 - bpl .outsw_8_lp + bpl .Loutsw_8_lp tst r2, #7 - LOADREGS(eqfd, sp!, {r4, r5, r6, pc}) + ldmeqfd sp!, {r4, r5, r6, pc} -.no_outsw_8: tst r2, #4 - beq .no_outsw_4 +.Lno_outsw_8: tst r2, #4 + beq .Lno_outsw_4 ldmia r1!, {r3, r4} @@ -103,8 +102,8 @@ ENTRY(__raw_writesw) orr ip, ip, ip, lsl #16 str ip, [r0] -.no_outsw_4: tst r2, #2 - beq .no_outsw_2 +.Lno_outsw_4: tst r2, #2 + beq .Lno_outsw_2 ldr r3, [r1], #4 @@ -116,7 +115,7 @@ ENTRY(__raw_writesw) orr ip, ip, ip, lsl #16 str ip, [r0] -.no_outsw_2: tst r2, #1 +.Lno_outsw_2: tst r2, #1 ldrne r3, [r1] @@ -124,4 +123,4 @@ ENTRY(__raw_writesw) orrne ip, ip, ip, lsr #16 strne ip, [r0] - LOADREGS(fd, sp!, {r4, r5, r6, pc}) + ldmfd sp!, {r4, r5, r6, pc} diff --git a/arch/arm/lib/io-writesw-armv4.S b/arch/arm/lib/io-writesw-armv4.S index 5e240e452af..ff4f71b579e 100644 --- a/arch/arm/lib/io-writesw-armv4.S +++ b/arch/arm/lib/io-writesw-armv4.S @@ -22,8 +22,8 @@ #endif .endm -.outsw_align: movs ip, r1, lsl #31 - bne .outsw_noalign +.Loutsw_align: movs ip, r1, lsl #31 + bne .Loutsw_noalign ldrh r3, [r1], #2 sub r2, r2, #1 @@ -33,35 +33,35 @@ ENTRY(__raw_writesw) teq r2, #0 moveq pc, lr ands r3, r1, #3 - bne .outsw_align + bne .Loutsw_align stmfd sp!, {r4, r5, lr} subs r2, r2, #8 - bmi .no_outsw_8 + bmi .Lno_outsw_8 -.outsw_8_lp: ldmia r1!, {r3, r4, r5, ip} +.Loutsw_8_lp: ldmia r1!, {r3, r4, r5, ip} subs r2, r2, #8 outword r3 outword r4 outword r5 outword ip - bpl .outsw_8_lp + bpl .Loutsw_8_lp -.no_outsw_8: tst r2, #4 - beq .no_outsw_4 +.Lno_outsw_8: tst r2, #4 + beq .Lno_outsw_4 ldmia r1!, {r3, ip} outword r3 outword ip -.no_outsw_4: movs r2, r2, lsl #31 - bcc .no_outsw_2 +.Lno_outsw_4: movs r2, r2, lsl #31 + bcc .Lno_outsw_2 ldr r3, [r1], #4 outword r3 -.no_outsw_2: ldrneh r3, [r1] +.Lno_outsw_2: ldrneh r3, [r1] strneh r3, [r0] ldmfd sp!, {r4, r5, pc} @@ -74,7 +74,11 @@ ENTRY(__raw_writesw) #define push_hbyte1 lsl #8 #endif -.outsw_noalign: ldr r3, [r1, -r3]! +.Loutsw_noalign: + ARM( ldr r3, [r1, -r3]! ) + THUMB( rsb r3, r3, #0 ) + THUMB( ldr r3, [r1, r3] ) + THUMB( sub r1, r3 ) subcs r2, r2, #1 bcs 2f subs r2, r2, #2 @@ -93,3 +97,4 @@ ENTRY(__raw_writesw) 3: movne ip, r3, lsr #8 strneh ip, [r0] mov pc, lr +ENDPROC(__raw_writesw) diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S index 59026029d01..c562f649734 100644 --- a/arch/arm/lib/lib1funcs.S +++ b/arch/arm/lib/lib1funcs.S @@ -1,7 +1,7 @@ /* * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines * - * Author: Nicolas Pitre <nico@cam.org> + * Author: Nicolas Pitre <nico@fluxnic.net> * - contributed to gcc-3.4 on Sep 30, 2003 * - adapted for the Linux kernel on Oct 2, 2003 */ @@ -35,7 +35,7 @@ Boston, MA 02111-1307, USA. */ #include <linux/linkage.h> #include <asm/assembler.h> - +#include <asm/unwind.h> .macro ARM_DIV_BODY dividend, divisor, result, curbit @@ -206,6 +206,8 @@ Boston, MA 02111-1307, USA. 
*/ ENTRY(__udivsi3) +ENTRY(__aeabi_uidiv) +UNWIND(.fnstart) subs r2, r1, #1 moveq pc, lr @@ -229,8 +231,12 @@ ENTRY(__udivsi3) mov r0, r0, lsr r2 mov pc, lr +UNWIND(.fnend) +ENDPROC(__udivsi3) +ENDPROC(__aeabi_uidiv) ENTRY(__umodsi3) +UNWIND(.fnstart) subs r2, r1, #1 @ compare divisor with 1 bcc Ldiv0 @@ -244,8 +250,12 @@ ENTRY(__umodsi3) mov pc, lr +UNWIND(.fnend) +ENDPROC(__umodsi3) ENTRY(__divsi3) +ENTRY(__aeabi_idiv) +UNWIND(.fnstart) cmp r1, #0 eor ip, r0, r1 @ save the sign of the result. @@ -282,8 +292,12 @@ ENTRY(__divsi3) rsbmi r0, r0, #0 mov pc, lr +UNWIND(.fnend) +ENDPROC(__divsi3) +ENDPROC(__aeabi_idiv) ENTRY(__modsi3) +UNWIND(.fnstart) cmp r1, #0 beq Ldiv0 @@ -303,12 +317,47 @@ ENTRY(__modsi3) rsbmi r0, r0, #0 mov pc, lr +UNWIND(.fnend) +ENDPROC(__modsi3) -Ldiv0: +#ifdef CONFIG_AEABI - str lr, [sp, #-4]! - bl __div0 - mov r0, #0 @ About as wrong as it could be. - ldr pc, [sp], #4 +ENTRY(__aeabi_uidivmod) +UNWIND(.fnstart) +UNWIND(.save {r0, r1, ip, lr} ) + stmfd sp!, {r0, r1, ip, lr} + bl __aeabi_uidiv + ldmfd sp!, {r1, r2, ip, lr} + mul r3, r0, r2 + sub r1, r1, r3 + mov pc, lr + +UNWIND(.fnend) +ENDPROC(__aeabi_uidivmod) + +ENTRY(__aeabi_idivmod) +UNWIND(.fnstart) +UNWIND(.save {r0, r1, ip, lr} ) + stmfd sp!, {r0, r1, ip, lr} + bl __aeabi_idiv + ldmfd sp!, {r1, r2, ip, lr} + mul r3, r0, r2 + sub r1, r1, r3 + mov pc, lr +UNWIND(.fnend) +ENDPROC(__aeabi_idivmod) + +#endif + +Ldiv0: +UNWIND(.fnstart) +UNWIND(.pad #4) +UNWIND(.save {lr}) + str lr, [sp, #-8]! + bl __div0 + mov r0, #0 @ About as wrong as it could be. + ldr pc, [sp], #8 +UNWIND(.fnend) +ENDPROC(Ldiv0) diff --git a/arch/arm/lib/lshrdi3.S b/arch/arm/lib/lshrdi3.S index 46c2ed19ec9..f83d449141f 100644 --- a/arch/arm/lib/lshrdi3.S +++ b/arch/arm/lib/lshrdi3.S @@ -37,12 +37,17 @@ Boston, MA 02110-1301, USA. */ #endif ENTRY(__lshrdi3) +ENTRY(__aeabi_llsr) subs r3, r2, #32 rsb ip, r2, #32 movmi al, al, lsr r2 movpl al, ah, lsr r3 - orrmi al, al, ah, lsl ip + ARM( orrmi al, al, ah, lsl ip ) + THUMB( lslmi r3, ah, ip ) + THUMB( orrmi al, al, r3 ) mov ah, ah, lsr r2 mov pc, lr +ENDPROC(__lshrdi3) +ENDPROC(__aeabi_llsr) diff --git a/arch/arm/lib/memchr.S b/arch/arm/lib/memchr.S index ac34fe55d21..1da86991d70 100644 --- a/arch/arm/lib/memchr.S +++ b/arch/arm/lib/memchr.S @@ -22,4 +22,5 @@ ENTRY(memchr) bne 1b sub r0, r0, #1 2: movne r0, #0 - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(memchr) diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index 7e71d6708a8..a9b9e2287a0 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -13,8 +13,11 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#define LDR1W_SHIFT 0 +#define STR1W_SHIFT 0 + .macro ldr1w ptr reg abort - ldr \reg, [\ptr], #4 + W(ldr) \reg, [\ptr], #4 .endm .macro ldr4w ptr reg1 reg2 reg3 reg4 abort @@ -30,7 +33,7 @@ .endm .macro str1w ptr reg abort - str \reg, [\ptr], #4 + W(str) \reg, [\ptr], #4 .endm .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort @@ -57,3 +60,4 @@ ENTRY(memcpy) #include "copy_template.S" +ENDPROC(memcpy) diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S index ef7fddc14ac..d1fc0c0c342 100644 --- a/arch/arm/lib/memmove.S +++ b/arch/arm/lib/memmove.S @@ -13,14 +13,6 @@ #include <linux/linkage.h> #include <asm/assembler.h> -/* - * This can be used to enable code to cacheline align the source pointer. - * Experiments on tested architectures (StrongARM and XScale) didn't show - * this a worthwhile thing to do. That might be different in the future. - */ -//#define CALGN(code...) 
code -#define CALGN(code...) - .text /* @@ -55,11 +47,12 @@ ENTRY(memmove) stmfd sp!, {r5 - r8} blt 5f - CALGN( ands ip, r1, #31 ) + CALGN( ands ip, r0, #31 ) CALGN( sbcnes r4, ip, r2 ) @ C is always set here CALGN( bcs 2f ) CALGN( adr r4, 6f ) CALGN( subs r2, r2, ip ) @ C is set here + CALGN( rsb ip, ip, #32 ) CALGN( add pc, r4, ip ) PLD( pld [r1, #-4] ) @@ -81,25 +74,25 @@ ENTRY(memmove) rsb ip, ip, #32 addne pc, pc, ip @ C is always clear here b 7f -6: nop - ldr r3, [r1, #-4]! - ldr r4, [r1, #-4]! - ldr r5, [r1, #-4]! - ldr r6, [r1, #-4]! - ldr r7, [r1, #-4]! - ldr r8, [r1, #-4]! - ldr lr, [r1, #-4]! +6: W(nop) + W(ldr) r3, [r1, #-4]! + W(ldr) r4, [r1, #-4]! + W(ldr) r5, [r1, #-4]! + W(ldr) r6, [r1, #-4]! + W(ldr) r7, [r1, #-4]! + W(ldr) r8, [r1, #-4]! + W(ldr) lr, [r1, #-4]! add pc, pc, ip nop - nop - str r3, [r0, #-4]! - str r4, [r0, #-4]! - str r5, [r0, #-4]! - str r6, [r0, #-4]! - str r7, [r0, #-4]! - str r8, [r0, #-4]! - str lr, [r0, #-4]! + W(nop) + W(str) r3, [r0, #-4]! + W(str) r4, [r0, #-4]! + W(str) r5, [r0, #-4]! + W(str) r6, [r0, #-4]! + W(str) r7, [r0, #-4]! + W(str) r8, [r0, #-4]! + W(str) lr, [r0, #-4]! CALGN( bcs 2b ) @@ -138,8 +131,7 @@ ENTRY(memmove) subs r2, r2, #28 blt 14f - CALGN( ands ip, r1, #31 ) - CALGN( rsb ip, ip, #32 ) + CALGN( ands ip, r0, #31 ) CALGN( sbcnes r4, ip, r2 ) @ C is always set here CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) @@ -155,24 +147,24 @@ ENTRY(memmove) 12: PLD( pld [r1, #-128] ) 13: ldmdb r1!, {r7, r8, r9, ip} - mov lr, r3, push #\push + mov lr, r3, lspush #\push subs r2, r2, #32 ldmdb r1!, {r3, r4, r5, r6} - orr lr, lr, ip, pull #\pull - mov ip, ip, push #\push - orr ip, ip, r9, pull #\pull - mov r9, r9, push #\push - orr r9, r9, r8, pull #\pull - mov r8, r8, push #\push - orr r8, r8, r7, pull #\pull - mov r7, r7, push #\push - orr r7, r7, r6, pull #\pull - mov r6, r6, push #\push - orr r6, r6, r5, pull #\pull - mov r5, r5, push #\push - orr r5, r5, r4, pull #\pull - mov r4, r4, push #\push - orr r4, r4, r3, pull #\pull + orr lr, lr, ip, lspull #\pull + mov ip, ip, lspush #\push + orr ip, ip, r9, lspull #\pull + mov r9, r9, lspush #\push + orr r9, r9, r8, lspull #\pull + mov r8, r8, lspush #\push + orr r8, r8, r7, lspull #\pull + mov r7, r7, lspush #\push + orr r7, r7, r6, lspull #\pull + mov r6, r6, lspush #\push + orr r6, r6, r5, lspull #\pull + mov r5, r5, lspush #\push + orr r5, r5, r4, lspull #\pull + mov r4, r4, lspush #\push + orr r4, r4, r3, lspull #\pull stmdb r0!, {r4 - r9, ip, lr} bge 12b PLD( cmn r2, #96 ) @@ -183,10 +175,10 @@ ENTRY(memmove) 14: ands ip, r2, #28 beq 16f -15: mov lr, r3, push #\push +15: mov lr, r3, lspush #\push ldr r3, [r1, #-4]! subs ip, ip, #4 - orr lr, lr, r3, pull #\pull + orr lr, lr, r3, lspull #\pull str lr, [r0, #-4]! bgt 15b CALGN( cmp r2, #0 ) @@ -204,3 +196,4 @@ ENTRY(memmove) 18: backward_copy_shift push=24 pull=8 +ENDPROC(memmove) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index a1795f59993..94b0650ea98 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -14,67 +14,110 @@ .text .align 5 - .word 0 - -1: subs r2, r2, #4 @ 1 do we have enough - blt 5f @ 1 bytes to align with? - cmp r3, #2 @ 1 - strltb r1, [r0], #1 @ 1 - strleb r1, [r0], #1 @ 1 - strb r1, [r0], #1 @ 1 - add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) -/* - * The pointer is now aligned and the length is adjusted. Try doing the - * memzero again. - */ ENTRY(memset) ands r3, r0, #3 @ 1 unaligned? 
- bne 1b @ 1 + mov ip, r0 @ preserve r0 as return value + bne 6f @ 1 /* - * we know that the pointer in r0 is aligned to a word boundary. + * we know that the pointer in ip is aligned to a word boundary. */ - orr r1, r1, r1, lsl #8 +1: orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 mov r3, r1 cmp r2, #16 blt 4f + +#if ! CALGN(1)+0 + /* - * We need an extra register for this loop - save the return address and - * use the LR + * We need 2 extra registers for this loop - use r8 and the LR */ - str lr, [sp, #-4]! - mov ip, r1 + stmfd sp!, {r8, lr} + mov r8, r1 mov lr, r1 2: subs r2, r2, #64 - stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} - stmgeia r0!, {r1, r3, ip, lr} + stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time. + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} + stmgeia ip!, {r1, r3, r8, lr} bgt 2b - LOADREGS(eqfd, sp!, {pc}) @ Now <64 bytes to go. + ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go. /* * No need to correct the count; we're only testing bits from now on */ tst r2, #32 - stmneia r0!, {r1, r3, ip, lr} - stmneia r0!, {r1, r3, ip, lr} + stmneia ip!, {r1, r3, r8, lr} + stmneia ip!, {r1, r3, r8, lr} tst r2, #16 - stmneia r0!, {r1, r3, ip, lr} - ldr lr, [sp], #4 + stmneia ip!, {r1, r3, r8, lr} + ldmfd sp!, {r8, lr} + +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines at once. + */ + + stmfd sp!, {r4-r8, lr} + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov r8, r1 + mov lr, r1 + + cmp r2, #96 + tstgt ip, #31 + ble 3f + + and r8, ip, #31 + rsb r8, r8, #32 + sub r2, r2, r8 + movs r8, r8, lsl #(32 - 4) + stmcsia ip!, {r4, r5, r6, r7} + stmmiia ip!, {r4, r5} + tst r8, #(1 << 30) + mov r8, r1 + strne r1, [ip], #4 + +3: subs r2, r2, #64 + stmgeia ip!, {r1, r3-r8, lr} + stmgeia ip!, {r1, r3-r8, lr} + bgt 3b + ldmeqfd sp!, {r4-r8, pc} + + tst r2, #32 + stmneia ip!, {r1, r3-r8, lr} + tst r2, #16 + stmneia ip!, {r4-r7} + ldmfd sp!, {r4-r8, lr} + +#endif 4: tst r2, #8 - stmneia r0!, {r1, r3} + stmneia ip!, {r1, r3} tst r2, #4 - strne r1, [r0], #4 + strne r1, [ip], #4 /* * When we get here, we've got less than 4 bytes to zero. We * may have an unaligned pointer as well. */ 5: tst r2, #2 - strneb r1, [r0], #1 - strneb r1, [r0], #1 + strneb r1, [ip], #1 + strneb r1, [ip], #1 tst r2, #1 - strneb r1, [r0], #1 - RETINSTR(mov,pc,lr) + strneb r1, [ip], #1 + mov pc, lr + +6: subs r2, r2, #4 @ 1 do we have enough + blt 5b @ 1 bytes to align with? + cmp r3, #2 @ 1 + strltb r1, [ip], #1 @ 1 + strleb r1, [ip], #1 @ 1 + strb r1, [ip], #1 @ 1 + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) + b 1b +ENDPROC(memset) diff --git a/arch/arm/lib/memzero.S b/arch/arm/lib/memzero.S index 51ccc60160f..3fbdef5f802 100644 --- a/arch/arm/lib/memzero.S +++ b/arch/arm/lib/memzero.S @@ -39,6 +39,9 @@ ENTRY(__memzero) */ cmp r1, #16 @ 1 we can skip this chunk if we blt 4f @ 1 have < 16 bytes + +#if ! CALGN(1)+0 + /* * We need an extra register for this loop - save the return address and * use the LR @@ -53,7 +56,7 @@ ENTRY(__memzero) stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4 bgt 3b @ 1 - LOADREGS(eqfd, sp!, {pc}) @ 1/2 quick exit + ldmeqfd sp!, {pc} @ 1/2 quick exit /* * No need to correct the count; we're only testing bits from now on */ @@ -64,6 +67,47 @@ ENTRY(__memzero) stmneia r0!, {r2, r3, ip, lr} @ 4 ldr lr, [sp], #4 @ 1 +#else + +/* + * This version aligns the destination pointer in order to write + * whole cache lines at once. 
+ */ + + stmfd sp!, {r4-r7, lr} + mov r4, r2 + mov r5, r2 + mov r6, r2 + mov r7, r2 + mov ip, r2 + mov lr, r2 + + cmp r1, #96 + andgts ip, r0, #31 + ble 3f + + rsb ip, ip, #32 + sub r1, r1, ip + movs ip, ip, lsl #(32 - 4) + stmcsia r0!, {r4, r5, r6, r7} + stmmiia r0!, {r4, r5} + movs ip, ip, lsl #2 + strcs r2, [r0], #4 + +3: subs r1, r1, #64 + stmgeia r0!, {r2-r7, ip, lr} + stmgeia r0!, {r2-r7, ip, lr} + bgt 3b + ldmeqfd sp!, {r4-r7, pc} + + tst r1, #32 + stmneia r0!, {r2-r7, ip, lr} + tst r1, #16 + stmneia r0!, {r4-r7} + ldmfd sp!, {r4-r7, lr} + +#endif + 4: tst r1, #8 @ 1 8 bytes or more? stmneia r0!, {r2, r3} @ 2 tst r1, #4 @ 1 4 bytes or more? @@ -77,4 +121,5 @@ ENTRY(__memzero) strneb r2, [r0], #1 @ 1 tst r1, #1 @ 1 a byte left over strneb r2, [r0], #1 @ 1 - RETINSTR(mov,pc,lr) @ 1 + mov pc, lr @ 1 +ENDPROC(__memzero) diff --git a/arch/arm/lib/muldi3.S b/arch/arm/lib/muldi3.S index c7fbdf00531..36c91b4957e 100644 --- a/arch/arm/lib/muldi3.S +++ b/arch/arm/lib/muldi3.S @@ -25,11 +25,12 @@ #endif ENTRY(__muldi3) +ENTRY(__aeabi_lmul) mul xh, yl, xh mla xh, xl, yh, xh - mov ip, xl, asr #16 - mov yh, yl, asr #16 + mov ip, xl, lsr #16 + mov yh, yl, lsr #16 bic xl, xl, ip, lsl #16 bic yl, yl, yh, lsl #16 mla xh, yh, ip, xh @@ -42,3 +43,5 @@ ENTRY(__muldi3) adc xh, xh, ip, lsr #16 mov pc, lr +ENDPROC(__muldi3) +ENDPROC(__aeabi_lmul) diff --git a/arch/arm/lib/putuser.S b/arch/arm/lib/putuser.S index 4593e9c07f0..3d73dcb959b 100644 --- a/arch/arm/lib/putuser.S +++ b/arch/arm/lib/putuser.S @@ -16,61 +16,83 @@ * __put_user_X * * Inputs: r0 contains the address + * r1 contains the address limit, which must be preserved * r2, r3 contains the value * Outputs: r0 is the error code * lr corrupted * - * No other registers must be altered. (see include/asm-arm/uaccess.h + * No other registers must be altered. (see <asm/uaccess.h> * for specific ASM register usage). * * Note that ADDR_LIMIT is either 0 or 0xc0000000 * Note also that it is intended that __put_user_bad is not global. 
*/ -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> +#include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/errno.h> +#include <asm/domain.h> - .global __put_user_1 -__put_user_1: -1: strbt r2, [r0] +ENTRY(__put_user_1) + check_uaccess r0, 1, r1, ip, __put_user_bad +1: TUSER(strb) r2, [r0] mov r0, #0 mov pc, lr +ENDPROC(__put_user_1) - .global __put_user_2 -__put_user_2: +ENTRY(__put_user_2) + check_uaccess r0, 2, r1, ip, __put_user_bad mov ip, r2, lsr #8 +#ifdef CONFIG_THUMB2_KERNEL #ifndef __ARMEB__ -2: strbt r2, [r0], #1 -3: strbt ip, [r0] +2: TUSER(strb) r2, [r0] +3: TUSER(strb) ip, [r0, #1] #else -2: strbt ip, [r0], #1 -3: strbt r2, [r0] +2: TUSER(strb) ip, [r0] +3: TUSER(strb) r2, [r0, #1] #endif +#else /* !CONFIG_THUMB2_KERNEL */ +#ifndef __ARMEB__ +2: TUSER(strb) r2, [r0], #1 +3: TUSER(strb) ip, [r0] +#else +2: TUSER(strb) ip, [r0], #1 +3: TUSER(strb) r2, [r0] +#endif +#endif /* CONFIG_THUMB2_KERNEL */ mov r0, #0 mov pc, lr +ENDPROC(__put_user_2) - .global __put_user_4 -__put_user_4: -4: strt r2, [r0] +ENTRY(__put_user_4) + check_uaccess r0, 4, r1, ip, __put_user_bad +4: TUSER(str) r2, [r0] mov r0, #0 mov pc, lr +ENDPROC(__put_user_4) - .global __put_user_8 -__put_user_8: -5: strt r2, [r0], #4 -6: strt r3, [r0] +ENTRY(__put_user_8) + check_uaccess r0, 8, r1, ip, __put_user_bad +#ifdef CONFIG_THUMB2_KERNEL +5: TUSER(str) r2, [r0] +6: TUSER(str) r3, [r0, #4] +#else +5: TUSER(str) r2, [r0], #4 +6: TUSER(str) r3, [r0] +#endif mov r0, #0 mov pc, lr +ENDPROC(__put_user_8) __put_user_bad: mov r0, #-EFAULT mov pc, lr +ENDPROC(__put_user_bad) -.section __ex_table, "a" +.pushsection __ex_table, "a" .long 1b, __put_user_bad .long 2b, __put_user_bad .long 3b, __put_user_bad .long 4b, __put_user_bad .long 5b, __put_user_bad .long 6b, __put_user_bad -.previous +.popsection diff --git a/arch/arm/lib/setbit.S b/arch/arm/lib/setbit.S index 83bc23d5b03..618fedae4b3 100644 --- a/arch/arm/lib/setbit.S +++ b/arch/arm/lib/setbit.S @@ -12,11 +12,4 @@ #include "bitops.h" .text -/* - * Purpose : Function to set a bit - * Prototype: int set_bit(int bit, void *addr) - */ -ENTRY(_set_bit_be) - eor r0, r0, #0x18 @ big endian byte ordering -ENTRY(_set_bit_le) - bitop orr +bitop _set_bit, orr diff --git a/arch/arm/lib/sha1.S b/arch/arm/lib/sha1.S deleted file mode 100644 index ff6ece487ff..00000000000 --- a/arch/arm/lib/sha1.S +++ /dev/null @@ -1,206 +0,0 @@ -/* - * linux/arch/arm/lib/sha1.S - * - * SHA transform optimized for ARM - * - * Copyright: (C) 2005 by Nicolas Pitre <nico@cam.org> - * Created: September 17, 2005 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * The reference implementation for this code is linux/lib/sha1.c - */ - -#include <linux/linkage.h> - - .text - - -/* - * void sha_transform(__u32 *digest, const char *in, __u32 *W) - * - * Note: the "in" ptr may be unaligned. 
- */ - -ENTRY(sha_transform) - - stmfd sp!, {r4 - r8, lr} - - @ for (i = 0; i < 16; i++) - @ W[i] = be32_to_cpu(in[i]); */ - -#ifdef __ARMEB__ - mov r4, r0 - mov r0, r2 - mov r2, #64 - bl memcpy - mov r2, r0 - mov r0, r4 -#else - mov r3, r2 - mov lr, #16 -1: ldrb r4, [r1], #1 - ldrb r5, [r1], #1 - ldrb r6, [r1], #1 - ldrb r7, [r1], #1 - subs lr, lr, #1 - orr r5, r5, r4, lsl #8 - orr r6, r6, r5, lsl #8 - orr r7, r7, r6, lsl #8 - str r7, [r3], #4 - bne 1b -#endif - - @ for (i = 0; i < 64; i++) - @ W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31); - - sub r3, r2, #4 - mov lr, #64 -2: ldr r4, [r3, #4]! - subs lr, lr, #1 - ldr r5, [r3, #8] - ldr r6, [r3, #32] - ldr r7, [r3, #52] - eor r4, r4, r5 - eor r4, r4, r6 - eor r4, r4, r7 - mov r4, r4, ror #31 - str r4, [r3, #64] - bne 2b - - /* - * The SHA functions are: - * - * f1(B,C,D) = (D ^ (B & (C ^ D))) - * f2(B,C,D) = (B ^ C ^ D) - * f3(B,C,D) = ((B & C) | (D & (B | C))) - * - * Then the sub-blocks are processed as follows: - * - * A' = ror(A, 27) + f(B,C,D) + E + K + *W++ - * B' = A - * C' = ror(B, 2) - * D' = C - * E' = D - * - * We therefore unroll each loop 5 times to avoid register shuffling. - * Also the ror for C (and also D and E which are successivelyderived - * from it) is applied in place to cut on an additional mov insn for - * each round. - */ - - .macro sha_f1, A, B, C, D, E - ldr r3, [r2], #4 - eor ip, \C, \D - add \E, r1, \E, ror #2 - and ip, \B, ip, ror #2 - add \E, \E, \A, ror #27 - eor ip, ip, \D, ror #2 - add \E, \E, r3 - add \E, \E, ip - .endm - - .macro sha_f2, A, B, C, D, E - ldr r3, [r2], #4 - add \E, r1, \E, ror #2 - eor ip, \B, \C, ror #2 - add \E, \E, \A, ror #27 - eor ip, ip, \D, ror #2 - add \E, \E, r3 - add \E, \E, ip - .endm - - .macro sha_f3, A, B, C, D, E - ldr r3, [r2], #4 - add \E, r1, \E, ror #2 - orr ip, \B, \C, ror #2 - add \E, \E, \A, ror #27 - and ip, ip, \D, ror #2 - add \E, \E, r3 - and r3, \B, \C, ror #2 - orr ip, ip, r3 - add \E, \E, ip - .endm - - ldmia r0, {r4 - r8} - - mov lr, #4 - ldr r1, .L_sha_K + 0 - - /* adjust initial values */ - mov r6, r6, ror #30 - mov r7, r7, ror #30 - mov r8, r8, ror #30 - -3: subs lr, lr, #1 - sha_f1 r4, r5, r6, r7, r8 - sha_f1 r8, r4, r5, r6, r7 - sha_f1 r7, r8, r4, r5, r6 - sha_f1 r6, r7, r8, r4, r5 - sha_f1 r5, r6, r7, r8, r4 - bne 3b - - ldr r1, .L_sha_K + 4 - mov lr, #4 - -4: subs lr, lr, #1 - sha_f2 r4, r5, r6, r7, r8 - sha_f2 r8, r4, r5, r6, r7 - sha_f2 r7, r8, r4, r5, r6 - sha_f2 r6, r7, r8, r4, r5 - sha_f2 r5, r6, r7, r8, r4 - bne 4b - - ldr r1, .L_sha_K + 8 - mov lr, #4 - -5: subs lr, lr, #1 - sha_f3 r4, r5, r6, r7, r8 - sha_f3 r8, r4, r5, r6, r7 - sha_f3 r7, r8, r4, r5, r6 - sha_f3 r6, r7, r8, r4, r5 - sha_f3 r5, r6, r7, r8, r4 - bne 5b - - ldr r1, .L_sha_K + 12 - mov lr, #4 - -6: subs lr, lr, #1 - sha_f2 r4, r5, r6, r7, r8 - sha_f2 r8, r4, r5, r6, r7 - sha_f2 r7, r8, r4, r5, r6 - sha_f2 r6, r7, r8, r4, r5 - sha_f2 r5, r6, r7, r8, r4 - bne 6b - - ldmia r0, {r1, r2, r3, ip, lr} - add r4, r1, r4 - add r5, r2, r5 - add r6, r3, r6, ror #2 - add r7, ip, r7, ror #2 - add r8, lr, r8, ror #2 - stmia r0, {r4 - r8} - - ldmfd sp!, {r4 - r8, pc} - -.L_sha_K: - .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 - - -/* - * void sha_init(__u32 *buf) - */ - -.L_sha_initial_digest: - .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 - -ENTRY(sha_init) - - str lr, [sp, #-4]! 
- adr r1, .L_sha_initial_digest - ldmia r1, {r1, r2, r3, ip, lr} - stmia r0, {r1, r2, r3, ip, lr} - ldr pc, [sp], #4 - diff --git a/arch/arm/lib/strchr.S b/arch/arm/lib/strchr.S index 5b9b493733f..d8f2a1c1aea 100644 --- a/arch/arm/lib/strchr.S +++ b/arch/arm/lib/strchr.S @@ -23,4 +23,5 @@ ENTRY(strchr) teq r2, r1 movne r0, #0 subeq r0, r0, #1 - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(strchr) diff --git a/arch/arm/lib/strncpy_from_user.S b/arch/arm/lib/strncpy_from_user.S deleted file mode 100644 index 629cc877527..00000000000 --- a/arch/arm/lib/strncpy_from_user.S +++ /dev/null @@ -1,43 +0,0 @@ -/* - * linux/arch/arm/lib/strncpy_from_user.S - * - * Copyright (C) 1995-2000 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/errno.h> - - .text - .align 5 - -/* - * Copy a string from user space to kernel space. - * r0 = dst, r1 = src, r2 = byte length - * returns the number of characters copied (strlen of copied string), - * -EFAULT on exception, or "len" if we fill the whole buffer - */ -ENTRY(__arch_strncpy_from_user) - save_lr - mov ip, r1 -1: subs r2, r2, #1 -USER( ldrplbt r3, [r1], #1) - bmi 2f - strb r3, [r0], #1 - teq r3, #0 - bne 1b - sub r1, r1, #1 @ take NUL character out of count -2: sub r0, r1, ip - restore_pc - - .section .fixup,"ax" - .align 0 -9001: mov r3, #0 - strb r3, [r0, #0] @ null terminate - mov r0, #-EFAULT - restore_pc - .previous - diff --git a/arch/arm/lib/strnlen_user.S b/arch/arm/lib/strnlen_user.S deleted file mode 100644 index 67bcd826812..00000000000 --- a/arch/arm/lib/strnlen_user.S +++ /dev/null @@ -1,40 +0,0 @@ -/* - * linux/arch/arm/lib/strnlen_user.S - * - * Copyright (C) 1995-2000 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/errno.h> - - .text - .align 5 - -/* Prototype: unsigned long __arch_strnlen_user(const char *str, long n) - * Purpose : get length of a string in user memory - * Params : str - address of string in user memory - * Returns : length of string *including terminator* - * or zero on exception, or n + 1 if too long - */ -ENTRY(__arch_strnlen_user) - save_lr - mov r2, r0 -1: -USER( ldrbt r3, [r0], #1) - teq r3, #0 - beq 2f - subs r1, r1, #1 - bne 1b - add r0, r0, #1 -2: sub r0, r0, r2 - restore_pc - - .section .fixup,"ax" - .align 0 -9001: mov r0, #0 - restore_pc - .previous diff --git a/arch/arm/lib/strrchr.S b/arch/arm/lib/strrchr.S index fa923f026f1..302f20cd242 100644 --- a/arch/arm/lib/strrchr.S +++ b/arch/arm/lib/strrchr.S @@ -22,4 +22,5 @@ ENTRY(strrchr) teq r2, #0 bne 1b mov r0, r3 - RETINSTR(mov,pc,lr) + mov pc, lr +ENDPROC(strrchr) diff --git a/arch/arm/lib/testchangebit.S b/arch/arm/lib/testchangebit.S index b25dcd2be53..4becdc3a59c 100644 --- a/arch/arm/lib/testchangebit.S +++ b/arch/arm/lib/testchangebit.S @@ -12,7 +12,4 @@ #include "bitops.h" .text -ENTRY(_test_and_change_bit_be) - eor r0, r0, #0x18 @ big endian byte ordering -ENTRY(_test_and_change_bit_le) - testop eor, strb +testop _test_and_change_bit, eor, str diff --git a/arch/arm/lib/testclearbit.S b/arch/arm/lib/testclearbit.S index 2dcc4b16b68..918841dcce7 100644 --- a/arch/arm/lib/testclearbit.S +++ b/arch/arm/lib/testclearbit.S @@ -12,7 +12,4 @@ #include "bitops.h" .text -ENTRY(_test_and_clear_bit_be) - eor r0, r0, #0x18 @ big endian byte ordering -ENTRY(_test_and_clear_bit_le) - testop bicne, strneb +testop _test_and_clear_bit, bicne, strne diff --git a/arch/arm/lib/testsetbit.S b/arch/arm/lib/testsetbit.S index 9011c969761..8d1b2fe9e48 100644 --- a/arch/arm/lib/testsetbit.S +++ b/arch/arm/lib/testsetbit.S @@ -12,7 +12,4 @@ #include "bitops.h" .text -ENTRY(_test_and_set_bit_be) - eor r0, r0, #0x18 @ big endian byte ordering -ENTRY(_test_and_set_bit_le) - testop orreq, streqb +testop _test_and_set_bit, orreq, streq diff --git a/arch/arm/lib/uaccess.S b/arch/arm/lib/uaccess.S index 6f1b5b49fe4..e50520904b7 100644 --- a/arch/arm/lib/uaccess.S +++ b/arch/arm/lib/uaccess.S @@ -14,12 +14,13 @@ #include <linux/linkage.h> #include <asm/assembler.h> #include <asm/errno.h> +#include <asm/domain.h> .text #define PAGE_SHIFT 12 -/* Prototype: int __arch_copy_to_user(void *to, const char *from, size_t n) +/* Prototype: int __copy_to_user(void *to, const char *from, size_t n) * Purpose : copy a block to user memory from kernel memory * Params : to - user memory * : from - kernel memory @@ -27,42 +28,42 @@ * Returns : Number of bytes NOT copied. 
*/ -.c2u_dest_not_aligned: +.Lc2u_dest_not_aligned: rsb ip, ip, #4 cmp ip, #2 ldrb r3, [r1], #1 -USER( strbt r3, [r0], #1) @ May fault +USER( TUSER( strb) r3, [r0], #1) @ May fault ldrgeb r3, [r1], #1 -USER( strgebt r3, [r0], #1) @ May fault +USER( TUSER( strgeb) r3, [r0], #1) @ May fault ldrgtb r3, [r1], #1 -USER( strgtbt r3, [r0], #1) @ May fault +USER( TUSER( strgtb) r3, [r0], #1) @ May fault sub r2, r2, ip - b .c2u_dest_aligned + b .Lc2u_dest_aligned -ENTRY(__arch_copy_to_user) +ENTRY(__copy_to_user) stmfd sp!, {r2, r4 - r7, lr} cmp r2, #4 - blt .c2u_not_enough + blt .Lc2u_not_enough ands ip, r0, #3 - bne .c2u_dest_not_aligned -.c2u_dest_aligned: + bne .Lc2u_dest_not_aligned +.Lc2u_dest_aligned: ands ip, r1, #3 - bne .c2u_src_not_aligned + bne .Lc2u_src_not_aligned /* * Seeing as there has to be at least 8 bytes to copy, we can * copy one word, and force a user-mode page fault... */ -.c2u_0fupi: subs r2, r2, #4 +.Lc2u_0fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .c2u_0nowords + bmi .Lc2u_0nowords ldr r3, [r1], #4 -USER( strt r3, [r0], #4) @ May fault +USER( TUSER( str) r3, [r0], #4) @ May fault mov ip, r0, lsl #32 - PAGE_SHIFT @ On each page, use a ld/st??t instruction rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .c2u_0fupi + beq .Lc2u_0fupi /* * ip = max no. of bytes to copy before needing another "strt" insn */ @@ -70,16 +71,16 @@ USER( strt r3, [r0], #4) @ May fault movlt ip, r2 sub r2, r2, ip subs ip, ip, #32 - blt .c2u_0rem8lp + blt .Lc2u_0rem8lp -.c2u_0cpy8lp: ldmia r1!, {r3 - r6} +.Lc2u_0cpy8lp: ldmia r1!, {r3 - r6} stmia r0!, {r3 - r6} @ Shouldnt fault ldmia r1!, {r3 - r6} subs ip, ip, #32 stmia r0!, {r3 - r6} @ Shouldnt fault - bpl .c2u_0cpy8lp + bpl .Lc2u_0cpy8lp -.c2u_0rem8lp: cmn ip, #16 +.Lc2u_0rem8lp: cmn ip, #16 ldmgeia r1!, {r3 - r6} stmgeia r0!, {r3 - r6} @ Shouldnt fault tst ip, #8 @@ -87,244 +88,246 @@ USER( strt r3, [r0], #4) @ May fault stmneia r0!, {r3 - r4} @ Shouldnt fault tst ip, #4 ldrne r3, [r1], #4 - strnet r3, [r0], #4 @ Shouldnt fault + TUSER( strne) r3, [r0], #4 @ Shouldnt fault ands ip, ip, #3 - beq .c2u_0fupi -.c2u_0nowords: teq ip, #0 - beq .c2u_finished -.c2u_nowords: cmp ip, #2 + beq .Lc2u_0fupi +.Lc2u_0nowords: teq ip, #0 + beq .Lc2u_finished +.Lc2u_nowords: cmp ip, #2 ldrb r3, [r1], #1 -USER( strbt r3, [r0], #1) @ May fault +USER( TUSER( strb) r3, [r0], #1) @ May fault ldrgeb r3, [r1], #1 -USER( strgebt r3, [r0], #1) @ May fault +USER( TUSER( strgeb) r3, [r0], #1) @ May fault ldrgtb r3, [r1], #1 -USER( strgtbt r3, [r0], #1) @ May fault - b .c2u_finished +USER( TUSER( strgtb) r3, [r0], #1) @ May fault + b .Lc2u_finished -.c2u_not_enough: +.Lc2u_not_enough: movs ip, r2 - bne .c2u_nowords -.c2u_finished: mov r0, #0 - LOADREGS(fd,sp!,{r2, r4 - r7, pc}) + bne .Lc2u_nowords +.Lc2u_finished: mov r0, #0 + ldmfd sp!, {r2, r4 - r7, pc} -.c2u_src_not_aligned: +.Lc2u_src_not_aligned: bic r1, r1, #3 ldr r7, [r1], #4 cmp ip, #2 - bgt .c2u_3fupi - beq .c2u_2fupi -.c2u_1fupi: subs r2, r2, #4 + bgt .Lc2u_3fupi + beq .Lc2u_2fupi +.Lc2u_1fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .c2u_1nowords - mov r3, r7, pull #8 + bmi .Lc2u_1nowords + mov r3, r7, lspull #8 ldr r7, [r1], #4 - orr r3, r3, r7, push #24 -USER( strt r3, [r0], #4) @ May fault + orr r3, r3, r7, lspush #24 +USER( TUSER( str) r3, [r0], #4) @ May fault mov ip, r0, lsl #32 - PAGE_SHIFT rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .c2u_1fupi + beq .Lc2u_1fupi cmp r2, ip movlt ip, r2 sub r2, r2, ip subs ip, ip, #16 - blt .c2u_1rem8lp + blt .Lc2u_1rem8lp -.c2u_1cpy8lp: mov r3, r7, pull #8 
+.Lc2u_1cpy8lp: mov r3, r7, lspull #8 ldmia r1!, {r4 - r7} subs ip, ip, #16 - orr r3, r3, r4, push #24 - mov r4, r4, pull #8 - orr r4, r4, r5, push #24 - mov r5, r5, pull #8 - orr r5, r5, r6, push #24 - mov r6, r6, pull #8 - orr r6, r6, r7, push #24 + orr r3, r3, r4, lspush #24 + mov r4, r4, lspull #8 + orr r4, r4, r5, lspush #24 + mov r5, r5, lspull #8 + orr r5, r5, r6, lspush #24 + mov r6, r6, lspull #8 + orr r6, r6, r7, lspush #24 stmia r0!, {r3 - r6} @ Shouldnt fault - bpl .c2u_1cpy8lp + bpl .Lc2u_1cpy8lp -.c2u_1rem8lp: tst ip, #8 - movne r3, r7, pull #8 +.Lc2u_1rem8lp: tst ip, #8 + movne r3, r7, lspull #8 ldmneia r1!, {r4, r7} - orrne r3, r3, r4, push #24 - movne r4, r4, pull #8 - orrne r4, r4, r7, push #24 + orrne r3, r3, r4, lspush #24 + movne r4, r4, lspull #8 + orrne r4, r4, r7, lspush #24 stmneia r0!, {r3 - r4} @ Shouldnt fault tst ip, #4 - movne r3, r7, pull #8 + movne r3, r7, lspull #8 ldrne r7, [r1], #4 - orrne r3, r3, r7, push #24 - strnet r3, [r0], #4 @ Shouldnt fault + orrne r3, r3, r7, lspush #24 + TUSER( strne) r3, [r0], #4 @ Shouldnt fault ands ip, ip, #3 - beq .c2u_1fupi -.c2u_1nowords: mov r3, r7, get_byte_1 + beq .Lc2u_1fupi +.Lc2u_1nowords: mov r3, r7, get_byte_1 teq ip, #0 - beq .c2u_finished + beq .Lc2u_finished cmp ip, #2 -USER( strbt r3, [r0], #1) @ May fault +USER( TUSER( strb) r3, [r0], #1) @ May fault movge r3, r7, get_byte_2 -USER( strgebt r3, [r0], #1) @ May fault +USER( TUSER( strgeb) r3, [r0], #1) @ May fault movgt r3, r7, get_byte_3 -USER( strgtbt r3, [r0], #1) @ May fault - b .c2u_finished +USER( TUSER( strgtb) r3, [r0], #1) @ May fault + b .Lc2u_finished -.c2u_2fupi: subs r2, r2, #4 +.Lc2u_2fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .c2u_2nowords - mov r3, r7, pull #16 + bmi .Lc2u_2nowords + mov r3, r7, lspull #16 ldr r7, [r1], #4 - orr r3, r3, r7, push #16 -USER( strt r3, [r0], #4) @ May fault + orr r3, r3, r7, lspush #16 +USER( TUSER( str) r3, [r0], #4) @ May fault mov ip, r0, lsl #32 - PAGE_SHIFT rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .c2u_2fupi + beq .Lc2u_2fupi cmp r2, ip movlt ip, r2 sub r2, r2, ip subs ip, ip, #16 - blt .c2u_2rem8lp + blt .Lc2u_2rem8lp -.c2u_2cpy8lp: mov r3, r7, pull #16 +.Lc2u_2cpy8lp: mov r3, r7, lspull #16 ldmia r1!, {r4 - r7} subs ip, ip, #16 - orr r3, r3, r4, push #16 - mov r4, r4, pull #16 - orr r4, r4, r5, push #16 - mov r5, r5, pull #16 - orr r5, r5, r6, push #16 - mov r6, r6, pull #16 - orr r6, r6, r7, push #16 + orr r3, r3, r4, lspush #16 + mov r4, r4, lspull #16 + orr r4, r4, r5, lspush #16 + mov r5, r5, lspull #16 + orr r5, r5, r6, lspush #16 + mov r6, r6, lspull #16 + orr r6, r6, r7, lspush #16 stmia r0!, {r3 - r6} @ Shouldnt fault - bpl .c2u_2cpy8lp + bpl .Lc2u_2cpy8lp -.c2u_2rem8lp: tst ip, #8 - movne r3, r7, pull #16 +.Lc2u_2rem8lp: tst ip, #8 + movne r3, r7, lspull #16 ldmneia r1!, {r4, r7} - orrne r3, r3, r4, push #16 - movne r4, r4, pull #16 - orrne r4, r4, r7, push #16 + orrne r3, r3, r4, lspush #16 + movne r4, r4, lspull #16 + orrne r4, r4, r7, lspush #16 stmneia r0!, {r3 - r4} @ Shouldnt fault tst ip, #4 - movne r3, r7, pull #16 + movne r3, r7, lspull #16 ldrne r7, [r1], #4 - orrne r3, r3, r7, push #16 - strnet r3, [r0], #4 @ Shouldnt fault + orrne r3, r3, r7, lspush #16 + TUSER( strne) r3, [r0], #4 @ Shouldnt fault ands ip, ip, #3 - beq .c2u_2fupi -.c2u_2nowords: mov r3, r7, get_byte_2 + beq .Lc2u_2fupi +.Lc2u_2nowords: mov r3, r7, get_byte_2 teq ip, #0 - beq .c2u_finished + beq .Lc2u_finished cmp ip, #2 -USER( strbt r3, [r0], #1) @ May fault +USER( TUSER( strb) r3, [r0], #1) @ May fault 
movge r3, r7, get_byte_3 -USER( strgebt r3, [r0], #1) @ May fault +USER( TUSER( strgeb) r3, [r0], #1) @ May fault ldrgtb r3, [r1], #0 -USER( strgtbt r3, [r0], #1) @ May fault - b .c2u_finished +USER( TUSER( strgtb) r3, [r0], #1) @ May fault + b .Lc2u_finished -.c2u_3fupi: subs r2, r2, #4 +.Lc2u_3fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .c2u_3nowords - mov r3, r7, pull #24 + bmi .Lc2u_3nowords + mov r3, r7, lspull #24 ldr r7, [r1], #4 - orr r3, r3, r7, push #8 -USER( strt r3, [r0], #4) @ May fault + orr r3, r3, r7, lspush #8 +USER( TUSER( str) r3, [r0], #4) @ May fault mov ip, r0, lsl #32 - PAGE_SHIFT rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .c2u_3fupi + beq .Lc2u_3fupi cmp r2, ip movlt ip, r2 sub r2, r2, ip subs ip, ip, #16 - blt .c2u_3rem8lp + blt .Lc2u_3rem8lp -.c2u_3cpy8lp: mov r3, r7, pull #24 +.Lc2u_3cpy8lp: mov r3, r7, lspull #24 ldmia r1!, {r4 - r7} subs ip, ip, #16 - orr r3, r3, r4, push #8 - mov r4, r4, pull #24 - orr r4, r4, r5, push #8 - mov r5, r5, pull #24 - orr r5, r5, r6, push #8 - mov r6, r6, pull #24 - orr r6, r6, r7, push #8 + orr r3, r3, r4, lspush #8 + mov r4, r4, lspull #24 + orr r4, r4, r5, lspush #8 + mov r5, r5, lspull #24 + orr r5, r5, r6, lspush #8 + mov r6, r6, lspull #24 + orr r6, r6, r7, lspush #8 stmia r0!, {r3 - r6} @ Shouldnt fault - bpl .c2u_3cpy8lp + bpl .Lc2u_3cpy8lp -.c2u_3rem8lp: tst ip, #8 - movne r3, r7, pull #24 +.Lc2u_3rem8lp: tst ip, #8 + movne r3, r7, lspull #24 ldmneia r1!, {r4, r7} - orrne r3, r3, r4, push #8 - movne r4, r4, pull #24 - orrne r4, r4, r7, push #8 + orrne r3, r3, r4, lspush #8 + movne r4, r4, lspull #24 + orrne r4, r4, r7, lspush #8 stmneia r0!, {r3 - r4} @ Shouldnt fault tst ip, #4 - movne r3, r7, pull #24 + movne r3, r7, lspull #24 ldrne r7, [r1], #4 - orrne r3, r3, r7, push #8 - strnet r3, [r0], #4 @ Shouldnt fault + orrne r3, r3, r7, lspush #8 + TUSER( strne) r3, [r0], #4 @ Shouldnt fault ands ip, ip, #3 - beq .c2u_3fupi -.c2u_3nowords: mov r3, r7, get_byte_3 + beq .Lc2u_3fupi +.Lc2u_3nowords: mov r3, r7, get_byte_3 teq ip, #0 - beq .c2u_finished + beq .Lc2u_finished cmp ip, #2 -USER( strbt r3, [r0], #1) @ May fault +USER( TUSER( strb) r3, [r0], #1) @ May fault ldrgeb r3, [r1], #1 -USER( strgebt r3, [r0], #1) @ May fault +USER( TUSER( strgeb) r3, [r0], #1) @ May fault ldrgtb r3, [r1], #0 -USER( strgtbt r3, [r0], #1) @ May fault - b .c2u_finished +USER( TUSER( strgtb) r3, [r0], #1) @ May fault + b .Lc2u_finished +ENDPROC(__copy_to_user) - .section .fixup,"ax" + .pushsection .fixup,"ax" .align 0 -9001: LOADREGS(fd,sp!, {r0, r4 - r7, pc}) - .previous +9001: ldmfd sp!, {r0, r4 - r7, pc} + .popsection -/* Prototype: unsigned long __arch_copy_from_user(void *to,const void *from,unsigned long n); +/* Prototype: unsigned long __copy_from_user(void *to,const void *from,unsigned long n); * Purpose : copy a block from user memory to kernel memory * Params : to - kernel memory * : from - user memory * : n - number of bytes to copy * Returns : Number of bytes NOT copied. 
*/ -.cfu_dest_not_aligned: +.Lcfu_dest_not_aligned: rsb ip, ip, #4 cmp ip, #2 -USER( ldrbt r3, [r1], #1) @ May fault +USER( TUSER( ldrb) r3, [r1], #1) @ May fault strb r3, [r0], #1 -USER( ldrgebt r3, [r1], #1) @ May fault +USER( TUSER( ldrgeb) r3, [r1], #1) @ May fault strgeb r3, [r0], #1 -USER( ldrgtbt r3, [r1], #1) @ May fault +USER( TUSER( ldrgtb) r3, [r1], #1) @ May fault strgtb r3, [r0], #1 sub r2, r2, ip - b .cfu_dest_aligned + b .Lcfu_dest_aligned -ENTRY(__arch_copy_from_user) +ENTRY(__copy_from_user) stmfd sp!, {r0, r2, r4 - r7, lr} cmp r2, #4 - blt .cfu_not_enough + blt .Lcfu_not_enough ands ip, r0, #3 - bne .cfu_dest_not_aligned -.cfu_dest_aligned: + bne .Lcfu_dest_not_aligned +.Lcfu_dest_aligned: ands ip, r1, #3 - bne .cfu_src_not_aligned + bne .Lcfu_src_not_aligned + /* * Seeing as there has to be at least 8 bytes to copy, we can * copy one word, and force a user-mode page fault... */ -.cfu_0fupi: subs r2, r2, #4 +.Lcfu_0fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .cfu_0nowords -USER( ldrt r3, [r1], #4) + bmi .Lcfu_0nowords +USER( TUSER( ldr) r3, [r1], #4) str r3, [r0], #4 mov ip, r1, lsl #32 - PAGE_SHIFT @ On each page, use a ld/st??t instruction rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .cfu_0fupi + beq .Lcfu_0fupi /* * ip = max no. of bytes to copy before needing another "strt" insn */ @@ -332,216 +335,218 @@ USER( ldrt r3, [r1], #4) movlt ip, r2 sub r2, r2, ip subs ip, ip, #32 - blt .cfu_0rem8lp + blt .Lcfu_0rem8lp -.cfu_0cpy8lp: ldmia r1!, {r3 - r6} @ Shouldnt fault +.Lcfu_0cpy8lp: ldmia r1!, {r3 - r6} @ Shouldnt fault stmia r0!, {r3 - r6} ldmia r1!, {r3 - r6} @ Shouldnt fault subs ip, ip, #32 stmia r0!, {r3 - r6} - bpl .cfu_0cpy8lp + bpl .Lcfu_0cpy8lp -.cfu_0rem8lp: cmn ip, #16 +.Lcfu_0rem8lp: cmn ip, #16 ldmgeia r1!, {r3 - r6} @ Shouldnt fault stmgeia r0!, {r3 - r6} tst ip, #8 ldmneia r1!, {r3 - r4} @ Shouldnt fault stmneia r0!, {r3 - r4} tst ip, #4 - ldrnet r3, [r1], #4 @ Shouldnt fault + TUSER( ldrne) r3, [r1], #4 @ Shouldnt fault strne r3, [r0], #4 ands ip, ip, #3 - beq .cfu_0fupi -.cfu_0nowords: teq ip, #0 - beq .cfu_finished -.cfu_nowords: cmp ip, #2 -USER( ldrbt r3, [r1], #1) @ May fault + beq .Lcfu_0fupi +.Lcfu_0nowords: teq ip, #0 + beq .Lcfu_finished +.Lcfu_nowords: cmp ip, #2 +USER( TUSER( ldrb) r3, [r1], #1) @ May fault strb r3, [r0], #1 -USER( ldrgebt r3, [r1], #1) @ May fault +USER( TUSER( ldrgeb) r3, [r1], #1) @ May fault strgeb r3, [r0], #1 -USER( ldrgtbt r3, [r1], #1) @ May fault +USER( TUSER( ldrgtb) r3, [r1], #1) @ May fault strgtb r3, [r0], #1 - b .cfu_finished + b .Lcfu_finished -.cfu_not_enough: +.Lcfu_not_enough: movs ip, r2 - bne .cfu_nowords -.cfu_finished: mov r0, #0 + bne .Lcfu_nowords +.Lcfu_finished: mov r0, #0 add sp, sp, #8 - LOADREGS(fd,sp!,{r4 - r7, pc}) + ldmfd sp!, {r4 - r7, pc} -.cfu_src_not_aligned: +.Lcfu_src_not_aligned: bic r1, r1, #3 -USER( ldrt r7, [r1], #4) @ May fault +USER( TUSER( ldr) r7, [r1], #4) @ May fault cmp ip, #2 - bgt .cfu_3fupi - beq .cfu_2fupi -.cfu_1fupi: subs r2, r2, #4 + bgt .Lcfu_3fupi + beq .Lcfu_2fupi +.Lcfu_1fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .cfu_1nowords - mov r3, r7, pull #8 -USER( ldrt r7, [r1], #4) @ May fault - orr r3, r3, r7, push #24 + bmi .Lcfu_1nowords + mov r3, r7, lspull #8 +USER( TUSER( ldr) r7, [r1], #4) @ May fault + orr r3, r3, r7, lspush #24 str r3, [r0], #4 mov ip, r1, lsl #32 - PAGE_SHIFT rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .cfu_1fupi + beq .Lcfu_1fupi cmp r2, ip movlt ip, r2 sub r2, r2, ip subs ip, ip, #16 - blt .cfu_1rem8lp + blt .Lcfu_1rem8lp 
-.cfu_1cpy8lp: mov r3, r7, pull #8 +.Lcfu_1cpy8lp: mov r3, r7, lspull #8 ldmia r1!, {r4 - r7} @ Shouldnt fault subs ip, ip, #16 - orr r3, r3, r4, push #24 - mov r4, r4, pull #8 - orr r4, r4, r5, push #24 - mov r5, r5, pull #8 - orr r5, r5, r6, push #24 - mov r6, r6, pull #8 - orr r6, r6, r7, push #24 + orr r3, r3, r4, lspush #24 + mov r4, r4, lspull #8 + orr r4, r4, r5, lspush #24 + mov r5, r5, lspull #8 + orr r5, r5, r6, lspush #24 + mov r6, r6, lspull #8 + orr r6, r6, r7, lspush #24 stmia r0!, {r3 - r6} - bpl .cfu_1cpy8lp + bpl .Lcfu_1cpy8lp -.cfu_1rem8lp: tst ip, #8 - movne r3, r7, pull #8 +.Lcfu_1rem8lp: tst ip, #8 + movne r3, r7, lspull #8 ldmneia r1!, {r4, r7} @ Shouldnt fault - orrne r3, r3, r4, push #24 - movne r4, r4, pull #8 - orrne r4, r4, r7, push #24 + orrne r3, r3, r4, lspush #24 + movne r4, r4, lspull #8 + orrne r4, r4, r7, lspush #24 stmneia r0!, {r3 - r4} tst ip, #4 - movne r3, r7, pull #8 -USER( ldrnet r7, [r1], #4) @ May fault - orrne r3, r3, r7, push #24 + movne r3, r7, lspull #8 +USER( TUSER( ldrne) r7, [r1], #4) @ May fault + orrne r3, r3, r7, lspush #24 strne r3, [r0], #4 ands ip, ip, #3 - beq .cfu_1fupi -.cfu_1nowords: mov r3, r7, get_byte_1 + beq .Lcfu_1fupi +.Lcfu_1nowords: mov r3, r7, get_byte_1 teq ip, #0 - beq .cfu_finished + beq .Lcfu_finished cmp ip, #2 strb r3, [r0], #1 movge r3, r7, get_byte_2 strgeb r3, [r0], #1 movgt r3, r7, get_byte_3 strgtb r3, [r0], #1 - b .cfu_finished + b .Lcfu_finished -.cfu_2fupi: subs r2, r2, #4 +.Lcfu_2fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .cfu_2nowords - mov r3, r7, pull #16 -USER( ldrt r7, [r1], #4) @ May fault - orr r3, r3, r7, push #16 + bmi .Lcfu_2nowords + mov r3, r7, lspull #16 +USER( TUSER( ldr) r7, [r1], #4) @ May fault + orr r3, r3, r7, lspush #16 str r3, [r0], #4 mov ip, r1, lsl #32 - PAGE_SHIFT rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .cfu_2fupi + beq .Lcfu_2fupi cmp r2, ip movlt ip, r2 sub r2, r2, ip subs ip, ip, #16 - blt .cfu_2rem8lp + blt .Lcfu_2rem8lp + -.cfu_2cpy8lp: mov r3, r7, pull #16 +.Lcfu_2cpy8lp: mov r3, r7, lspull #16 ldmia r1!, {r4 - r7} @ Shouldnt fault subs ip, ip, #16 - orr r3, r3, r4, push #16 - mov r4, r4, pull #16 - orr r4, r4, r5, push #16 - mov r5, r5, pull #16 - orr r5, r5, r6, push #16 - mov r6, r6, pull #16 - orr r6, r6, r7, push #16 + orr r3, r3, r4, lspush #16 + mov r4, r4, lspull #16 + orr r4, r4, r5, lspush #16 + mov r5, r5, lspull #16 + orr r5, r5, r6, lspush #16 + mov r6, r6, lspull #16 + orr r6, r6, r7, lspush #16 stmia r0!, {r3 - r6} - bpl .cfu_2cpy8lp + bpl .Lcfu_2cpy8lp -.cfu_2rem8lp: tst ip, #8 - movne r3, r7, pull #16 +.Lcfu_2rem8lp: tst ip, #8 + movne r3, r7, lspull #16 ldmneia r1!, {r4, r7} @ Shouldnt fault - orrne r3, r3, r4, push #16 - movne r4, r4, pull #16 - orrne r4, r4, r7, push #16 + orrne r3, r3, r4, lspush #16 + movne r4, r4, lspull #16 + orrne r4, r4, r7, lspush #16 stmneia r0!, {r3 - r4} tst ip, #4 - movne r3, r7, pull #16 -USER( ldrnet r7, [r1], #4) @ May fault - orrne r3, r3, r7, push #16 + movne r3, r7, lspull #16 +USER( TUSER( ldrne) r7, [r1], #4) @ May fault + orrne r3, r3, r7, lspush #16 strne r3, [r0], #4 ands ip, ip, #3 - beq .cfu_2fupi -.cfu_2nowords: mov r3, r7, get_byte_2 + beq .Lcfu_2fupi +.Lcfu_2nowords: mov r3, r7, get_byte_2 teq ip, #0 - beq .cfu_finished + beq .Lcfu_finished cmp ip, #2 strb r3, [r0], #1 movge r3, r7, get_byte_3 strgeb r3, [r0], #1 -USER( ldrgtbt r3, [r1], #0) @ May fault +USER( TUSER( ldrgtb) r3, [r1], #0) @ May fault strgtb r3, [r0], #1 - b .cfu_finished + b .Lcfu_finished -.cfu_3fupi: subs r2, r2, #4 
+.Lcfu_3fupi: subs r2, r2, #4 addmi ip, r2, #4 - bmi .cfu_3nowords - mov r3, r7, pull #24 -USER( ldrt r7, [r1], #4) @ May fault - orr r3, r3, r7, push #8 + bmi .Lcfu_3nowords + mov r3, r7, lspull #24 +USER( TUSER( ldr) r7, [r1], #4) @ May fault + orr r3, r3, r7, lspush #8 str r3, [r0], #4 mov ip, r1, lsl #32 - PAGE_SHIFT rsb ip, ip, #0 movs ip, ip, lsr #32 - PAGE_SHIFT - beq .cfu_3fupi + beq .Lcfu_3fupi cmp r2, ip movlt ip, r2 sub r2, r2, ip subs ip, ip, #16 - blt .cfu_3rem8lp + blt .Lcfu_3rem8lp -.cfu_3cpy8lp: mov r3, r7, pull #24 +.Lcfu_3cpy8lp: mov r3, r7, lspull #24 ldmia r1!, {r4 - r7} @ Shouldnt fault - orr r3, r3, r4, push #8 - mov r4, r4, pull #24 - orr r4, r4, r5, push #8 - mov r5, r5, pull #24 - orr r5, r5, r6, push #8 - mov r6, r6, pull #24 - orr r6, r6, r7, push #8 + orr r3, r3, r4, lspush #8 + mov r4, r4, lspull #24 + orr r4, r4, r5, lspush #8 + mov r5, r5, lspull #24 + orr r5, r5, r6, lspush #8 + mov r6, r6, lspull #24 + orr r6, r6, r7, lspush #8 stmia r0!, {r3 - r6} subs ip, ip, #16 - bpl .cfu_3cpy8lp + bpl .Lcfu_3cpy8lp -.cfu_3rem8lp: tst ip, #8 - movne r3, r7, pull #24 +.Lcfu_3rem8lp: tst ip, #8 + movne r3, r7, lspull #24 ldmneia r1!, {r4, r7} @ Shouldnt fault - orrne r3, r3, r4, push #8 - movne r4, r4, pull #24 - orrne r4, r4, r7, push #8 + orrne r3, r3, r4, lspush #8 + movne r4, r4, lspull #24 + orrne r4, r4, r7, lspush #8 stmneia r0!, {r3 - r4} tst ip, #4 - movne r3, r7, pull #24 -USER( ldrnet r7, [r1], #4) @ May fault - orrne r3, r3, r7, push #8 + movne r3, r7, lspull #24 +USER( TUSER( ldrne) r7, [r1], #4) @ May fault + orrne r3, r3, r7, lspush #8 strne r3, [r0], #4 ands ip, ip, #3 - beq .cfu_3fupi -.cfu_3nowords: mov r3, r7, get_byte_3 + beq .Lcfu_3fupi +.Lcfu_3nowords: mov r3, r7, get_byte_3 teq ip, #0 - beq .cfu_finished + beq .Lcfu_finished cmp ip, #2 strb r3, [r0], #1 -USER( ldrgebt r3, [r1], #1) @ May fault +USER( TUSER( ldrgeb) r3, [r1], #1) @ May fault strgeb r3, [r0], #1 -USER( ldrgtbt r3, [r1], #1) @ May fault +USER( TUSER( ldrgtb) r3, [r1], #1) @ May fault strgtb r3, [r0], #1 - b .cfu_finished + b .Lcfu_finished +ENDPROC(__copy_from_user) - .section .fixup,"ax" + .pushsection .fixup,"ax" .align 0 /* * We took an exception. r0 contains a pointer to @@ -554,6 +559,6 @@ USER( ldrgtbt r3, [r1], #1) @ May fault movne r1, r4 blne __memzero mov r0, r4 - LOADREGS(fd,sp!, {r4 - r7, pc}) - .previous + ldmfd sp!, {r4 - r7, pc} + .popsection diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c new file mode 100644 index 00000000000..3e58d710013 --- /dev/null +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -0,0 +1,270 @@ +/* + * linux/arch/arm/lib/uaccess_with_memcpy.c + * + * Written by: Lennert Buytenhek and Nicolas Pitre + * Copyright (C) 2009 Marvell Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/uaccess.h> +#include <linux/rwsem.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/hardirq.h> /* for in_atomic() */ +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <asm/current.h> +#include <asm/page.h> + +static int +pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) +{ + unsigned long addr = (unsigned long)_addr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pud_t *pud; + spinlock_t *ptl; + + pgd = pgd_offset(current->mm, addr); + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd))) + return 0; + + pud = pud_offset(pgd, addr); + if (unlikely(pud_none(*pud) || pud_bad(*pud))) + return 0; + + pmd = pmd_offset(pud, addr); + if (unlikely(pmd_none(*pmd))) + return 0; + + /* + * A pmd can be bad if it refers to a HugeTLB or THP page. + * + * Both THP and HugeTLB pages have the same pmd layout + * and should not be manipulated by the pte functions. + * + * Lock the page table for the destination and check + * to see that it's still huge and whether or not we will + * need to fault on write, or if we have a splitting THP. + */ + if (unlikely(pmd_thp_or_huge(*pmd))) { + ptl = ¤t->mm->page_table_lock; + spin_lock(ptl); + if (unlikely(!pmd_thp_or_huge(*pmd) + || pmd_hugewillfault(*pmd) + || pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + return 0; + } + + *ptep = NULL; + *ptlp = ptl; + return 1; + } + + if (unlikely(pmd_bad(*pmd))) + return 0; + + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); + if (unlikely(!pte_present(*pte) || !pte_young(*pte) || + !pte_write(*pte) || !pte_dirty(*pte))) { + pte_unmap_unlock(pte, ptl); + return 0; + } + + *ptep = pte; + *ptlp = ptl; + + return 1; +} + +static unsigned long noinline +__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) +{ + int atomic; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy((void *)to, from, n); + return 0; + } + + /* the mmap semaphore is taken only if not in an atomic context */ + atomic = in_atomic(); + + if (!atomic) + down_read(¤t->mm->mmap_sem); + while (n) { + pte_t *pte; + spinlock_t *ptl; + int tocopy; + + while (!pin_page_for_write(to, &pte, &ptl)) { + if (!atomic) + up_read(¤t->mm->mmap_sem); + if (__put_user(0, (char __user *)to)) + goto out; + if (!atomic) + down_read(¤t->mm->mmap_sem); + } + + tocopy = (~(unsigned long)to & ~PAGE_MASK) + 1; + if (tocopy > n) + tocopy = n; + + memcpy((void *)to, from, tocopy); + to += tocopy; + from += tocopy; + n -= tocopy; + + if (pte) + pte_unmap_unlock(pte, ptl); + else + spin_unlock(ptl); + } + if (!atomic) + up_read(¤t->mm->mmap_sem); + +out: + return n; +} + +unsigned long +__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + /* + * This test is stubbed out of the main function above to keep + * the overhead for small copies low by avoiding a large + * register dump on the stack just to reload them right away. + * With frame pointer disabled, tail call optimization kicks in + * as well making this test almost invisible. 
+ */ + if (n < 64) + return __copy_to_user_std(to, from, n); + return __copy_to_user_memcpy(to, from, n); +} + +static unsigned long noinline +__clear_user_memset(void __user *addr, unsigned long n) +{ + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memset((void *)addr, 0, n); + return 0; + } + + down_read(¤t->mm->mmap_sem); + while (n) { + pte_t *pte; + spinlock_t *ptl; + int tocopy; + + while (!pin_page_for_write(addr, &pte, &ptl)) { + up_read(¤t->mm->mmap_sem); + if (__put_user(0, (char __user *)addr)) + goto out; + down_read(¤t->mm->mmap_sem); + } + + tocopy = (~(unsigned long)addr & ~PAGE_MASK) + 1; + if (tocopy > n) + tocopy = n; + + memset((void *)addr, 0, tocopy); + addr += tocopy; + n -= tocopy; + + if (pte) + pte_unmap_unlock(pte, ptl); + else + spin_unlock(ptl); + } + up_read(¤t->mm->mmap_sem); + +out: + return n; +} + +unsigned long __clear_user(void __user *addr, unsigned long n) +{ + /* See rational for this in __copy_to_user() above. */ + if (n < 64) + return __clear_user_std(addr, n); + return __clear_user_memset(addr, n); +} + +#if 0 + +/* + * This code is disabled by default, but kept around in case the chosen + * thresholds need to be revalidated. Some overhead (small but still) + * would be implied by a runtime determined variable threshold, and + * so far the measurement on concerned targets didn't show a worthwhile + * variation. + * + * Note that a fairly precise sched_clock() implementation is needed + * for results to make some sense. + */ + +#include <linux/vmalloc.h> + +static int __init test_size_treshold(void) +{ + struct page *src_page, *dst_page; + void *user_ptr, *kernel_ptr; + unsigned long long t0, t1, t2; + int size, ret; + + ret = -ENOMEM; + src_page = alloc_page(GFP_KERNEL); + if (!src_page) + goto no_src; + dst_page = alloc_page(GFP_KERNEL); + if (!dst_page) + goto no_dst; + kernel_ptr = page_address(src_page); + user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010)); + if (!user_ptr) + goto no_vmap; + + /* warm up the src page dcache */ + ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE); + + for (size = PAGE_SIZE; size >= 4; size /= 2) { + t0 = sched_clock(); + ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size); + t1 = sched_clock(); + ret |= __copy_to_user_std(user_ptr, kernel_ptr, size); + t2 = sched_clock(); + printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1); + } + + for (size = PAGE_SIZE; size >= 4; size /= 2) { + t0 = sched_clock(); + ret |= __clear_user_memset(user_ptr, size); + t1 = sched_clock(); + ret |= __clear_user_std(user_ptr, size); + t2 = sched_clock(); + printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1); + } + + if (ret) + ret = -EFAULT; + + vunmap(user_ptr); +no_vmap: + put_page(dst_page); +no_dst: + put_page(src_page); +no_src: + return ret; +} + +subsys_initcall(test_size_treshold); + +#endif diff --git a/arch/arm/lib/ucmpdi2.S b/arch/arm/lib/ucmpdi2.S index 112630f93e5..f0df6a91db0 100644 --- a/arch/arm/lib/ucmpdi2.S +++ b/arch/arm/lib/ucmpdi2.S @@ -33,3 +33,20 @@ ENTRY(__ucmpdi2) movhi r0, #2 mov pc, lr +ENDPROC(__ucmpdi2) + +#ifdef CONFIG_AEABI + +ENTRY(__aeabi_ulcmp) + + cmp xh, yh + cmpeq xl, yl + movlo r0, #-1 + moveq r0, #0 + movhi r0, #1 + mov pc, lr + +ENDPROC(__aeabi_ulcmp) + +#endif + diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c new file mode 100644 index 00000000000..2c40aeab3ea --- /dev/null +++ b/arch/arm/lib/xor-neon.c @@ -0,0 +1,46 @@ +/* + * linux/arch/arm/lib/xor-neon.c + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * 
This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/raid/xor.h>
+#include <linux/module.h>
+
+MODULE_LICENSE("GPL");
+
+#ifndef __ARM_NEON__
+#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
+#endif
+
+/*
+ * Pull in the reference implementations while instructing GCC (through
+ * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
+ * NEON instructions.
+ */
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC optimize "tree-vectorize"
+#else
+/*
+ * While older versions of GCC do not generate incorrect code, they fail to
+ * recognize the parallel nature of these functions, and emit plain ARM code,
+ * which is known to be slower than the optimized ARM code in asm-arm/xor.h.
+ */
+#warning This code requires at least version 4.6 of GCC
+#endif
+
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#include <asm-generic/xor.h>
+
+struct xor_block_template const xor_block_neon_inner = {
+	.name	= "__inner_neon__",
+	.do_2	= xor_8regs_2,
+	.do_3	= xor_8regs_3,
+	.do_4	= xor_8regs_4,
+	.do_5	= xor_8regs_5,
+};
+EXPORT_SYMBOL(xor_block_neon_inner);
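
The xor-neon.c addition relies entirely on the compiler for the NEON code: it recompiles the generic xor_8regs_* helpers from <asm-generic/xor.h> with the tree-vectorizer enabled and the '-mfloat-abi=softfp -mfpu=neon' flags named in its #error check, and exports the result as xor_block_neon_inner. A rough standalone sketch of the kind of loop the vectorizer is expected to turn into NEON loads/stores is below; the function name and the fixed 4096-byte block size are illustrative assumptions, not the kernel interface (the real helpers take the byte count as their first argument).

#include <stddef.h>

#define BLOCK_BYTES 4096	/* assumption: one fixed-size block */

/* Independent word-wise XOR iterations: with
 *   -O2 -ftree-vectorize -mfloat-abi=softfp -mfpu=neon
 * GCC can turn this loop into NEON loads, veor and stores. */
void xor_words_2(unsigned long *p1, const unsigned long *p2)
{
	size_t i;

	for (i = 0; i < BLOCK_BYTES / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}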

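Most of the assembler hunks in this series (io-readsl.S, io-writesl.S, memmove.S, uaccess.S) only rename the pull/push shifter macros to lspull/lspush; the underlying misaligned-copy trick is unchanged: each loop keeps the partially consumed aligned word and splices it with the next aligned load. A little-endian C model of that splice is sketched below; the helper names and the fixed one-byte misalignment are illustrative assumptions, not kernel code (on big-endian the two shift directions swap).

#include <stdint.h>
#include <stddef.h>

/* Models the pair used in the copy loops, e.g. in __copy_to_user:
 *     mov r3, r7, lspull #8
 *     orr r3, r3, r4, lspush #24
 * i.e. take the top three bytes of the previous aligned word and
 * the low byte of the next one (little-endian). */
static inline uint32_t splice8(uint32_t prev, uint32_t next)
{
	return (prev >> 8) | (next << 24);
}

/* Copy 'nwords' 32-bit words that start one byte into the aligned
 * word at src_aligned, using only aligned loads. */
void copy_from_offset1(uint32_t *dst, const uint32_t *src_aligned,
		       size_t nwords)
{
	uint32_t prev = *src_aligned++;

	while (nwords--) {
		uint32_t next = *src_aligned++;
		*dst++ = splice8(prev, next);
		prev = next;
	}
}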