Diffstat (limited to 'arch/sparc/lib')
51 files changed, 1236 insertions, 3625 deletions
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 846d1c4374e..3269b023409 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -4,18 +4,16 @@  asflags-y := -ansi -DST_DIV0=0x02  ccflags-y := -Werror -lib-$(CONFIG_SPARC32) += mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o +lib-$(CONFIG_SPARC32) += ashrdi3.o  lib-$(CONFIG_SPARC32) += memcpy.o memset.o  lib-y                 += strlen.o  lib-y                 += checksum_$(BITS).o  lib-$(CONFIG_SPARC32) += blockops.o  lib-y                 += memscan_$(BITS).o memcmp.o strncmp_$(BITS).o -lib-y                 += strncpy_from_user_$(BITS).o strlen_user_$(BITS).o  lib-$(CONFIG_SPARC32) += divdi3.o udivdi3.o  lib-$(CONFIG_SPARC32) += copy_user.o locks.o -lib-y                 += atomic_$(BITS).o +lib-$(CONFIG_SPARC64) += atomic_64.o  lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o -lib-$(CONFIG_SPARC32) += rwsem_32.o  lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o  lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o @@ -32,16 +30,18 @@ lib-$(CONFIG_SPARC64) += NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o  lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o  lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o -lib-$(CONFIG_SPARC64) +=  NG2patch.o NG2page.o +lib-$(CONFIG_SPARC64) +=  NG2patch.o + +lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o +lib-$(CONFIG_SPARC64) +=  NG4patch.o NG4copy_page.o NG4clear_page.o NG4memset.o  lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o  lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o  lib-$(CONFIG_SPARC64) += copy_in_user.o user_fixup.o memmove.o -lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o +lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o -obj-y                 += iomap.o -obj-$(CONFIG_SPARC32) += atomic32.o +obj-$(CONFIG_SPARC64) += iomap.o +obj-$(CONFIG_SPARC32) += atomic32.o ucmpdi2.o  obj-y                 += ksyms.o  obj-$(CONFIG_SPARC64) += PeeCeeI.o -obj-y                 += usercopy.o diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S index 0aed75653b5..30eee6e8a81 100644 --- a/arch/sparc/lib/NG2memcpy.S +++ b/arch/sparc/lib/NG2memcpy.S @@ -14,7 +14,7 @@  #define FPRS_FEF  0x04  #ifdef MEMCPY_DEBUG  #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ -		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; +		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;  #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs  #else  #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs @@ -90,49 +90,49 @@  	faligndata	%x7, %x8, %f14;  #define FREG_MOVE_1(x0) \ -	fmovd		%x0, %f0; +	fsrc2		%x0, %f0;  #define FREG_MOVE_2(x0, x1) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2;  #define FREG_MOVE_3(x0, x1, x2) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; \ -	fmovd		%x2, %f4; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2; \ +	fsrc2		%x2, %f4;  #define FREG_MOVE_4(x0, x1, x2, x3) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; \ -	fmovd		%x2, %f4; \ -	fmovd		%x3, %f6; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2; \ +	fsrc2		%x2, %f4; \ +	fsrc2		%x3, %f6;  #define FREG_MOVE_5(x0, x1, x2, x3, x4) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; \ -	fmovd		%x2, %f4; \ -	fmovd		%x3, %f6; \ -	fmovd		%x4, %f8; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2; \ +	fsrc2		%x2, %f4; \ +	fsrc2		%x3, %f6; \ +	fsrc2		%x4, %f8;  #define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; \ -	fmovd		%x2, 
%f4; \ -	fmovd		%x3, %f6; \ -	fmovd		%x4, %f8; \ -	fmovd		%x5, %f10; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2; \ +	fsrc2		%x2, %f4; \ +	fsrc2		%x3, %f6; \ +	fsrc2		%x4, %f8; \ +	fsrc2		%x5, %f10;  #define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; \ -	fmovd		%x2, %f4; \ -	fmovd		%x3, %f6; \ -	fmovd		%x4, %f8; \ -	fmovd		%x5, %f10; \ -	fmovd		%x6, %f12; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2; \ +	fsrc2		%x2, %f4; \ +	fsrc2		%x3, %f6; \ +	fsrc2		%x4, %f8; \ +	fsrc2		%x5, %f10; \ +	fsrc2		%x6, %f12;  #define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \ -	fmovd		%x0, %f0; \ -	fmovd		%x1, %f2; \ -	fmovd		%x2, %f4; \ -	fmovd		%x3, %f6; \ -	fmovd		%x4, %f8; \ -	fmovd		%x5, %f10; \ -	fmovd		%x6, %f12; \ -	fmovd		%x7, %f14; +	fsrc2		%x0, %f0; \ +	fsrc2		%x1, %f2; \ +	fsrc2		%x2, %f4; \ +	fsrc2		%x3, %f6; \ +	fsrc2		%x4, %f8; \ +	fsrc2		%x5, %f10; \ +	fsrc2		%x6, %f12; \ +	fsrc2		%x7, %f14;  #define FREG_LOAD_1(base, x0) \  	EX_LD(LOAD(ldd, base + 0x00, %x0))  #define FREG_LOAD_2(base, x0, x1) \ @@ -182,13 +182,13 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	cmp		%g2, 0  	tne		%xcc, 5  	PREAMBLE -	mov		%o0, GLOBAL_SPARE +	mov		%o0, %o3  	cmp		%o2, 0  	be,pn		%XCC, 85f -	 or		%o0, %o1, %o3 +	 or		%o0, %o1, GLOBAL_SPARE  	cmp		%o2, 16  	blu,a,pn	%XCC, 80f -	 or		%o3, %o2, %o3 +	 or		GLOBAL_SPARE, %o2, GLOBAL_SPARE  	/* 2 blocks (128 bytes) is the minimum we can do the block  	 * copy with.  We need to ensure that we'll iterate at least @@ -202,7 +202,7 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	 */  	cmp		%o2, (4 * 64)  	blu,pt		%XCC, 75f -	 andcc		%o3, 0x7, %g0 +	 andcc		GLOBAL_SPARE, 0x7, %g0  	/* %o0:	dst  	 * %o1:	src @@ -236,6 +236,7 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	 */  	VISEntryHalf +	membar		#Sync  	alignaddr	%o1, %g0, %g0  	add		%o1, (64 - 1), %o4 @@ -404,13 +405,13 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	 * over. If anything is left, we copy it one byte at a time.  	 
*/  	brz,pt		%o2, 85f -	 sub		%o0, %o1, %o3 +	 sub		%o0, %o1, GLOBAL_SPARE  	ba,a,pt		%XCC, 90f  	.align		64  75: /* 16 < len <= 64 */  	bne,pn		%XCC, 75f -	 sub		%o0, %o1, %o3 +	 sub		%o0, %o1, GLOBAL_SPARE  72:  	andn		%o2, 0xf, %o4 @@ -420,9 +421,9 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	add		%o1, 0x08, %o1  	EX_LD(LOAD(ldx, %o1, %g1))  	sub		%o1, 0x08, %o1 -	EX_ST(STORE(stx, %o5, %o1 + %o3)) +	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))  	add		%o1, 0x8, %o1 -	EX_ST(STORE(stx, %g1, %o1 + %o3)) +	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE))  	bgu,pt		%XCC, 1b  	 add		%o1, 0x8, %o1  73:	andcc		%o2, 0x8, %g0 @@ -430,14 +431,14 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	 nop  	sub		%o2, 0x8, %o2  	EX_LD(LOAD(ldx, %o1, %o5)) -	EX_ST(STORE(stx, %o5, %o1 + %o3)) +	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))  	add		%o1, 0x8, %o1  1:	andcc		%o2, 0x4, %g0  	be,pt		%XCC, 1f  	 nop  	sub		%o2, 0x4, %o2  	EX_LD(LOAD(lduw, %o1, %o5)) -	EX_ST(STORE(stw, %o5, %o1 + %o3)) +	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE))  	add		%o1, 0x4, %o1  1:	cmp		%o2, 0  	be,pt		%XCC, 85f @@ -454,11 +455,11 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  1:	subcc		%g1, 1, %g1  	EX_LD(LOAD(ldub, %o1, %o5)) -	EX_ST(STORE(stb, %o5, %o1 + %o3)) +	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE))  	bgu,pt		%icc, 1b  	 add		%o1, 1, %o1 -2:	add		%o1, %o3, %o0 +2:	add		%o1, GLOBAL_SPARE, %o0  	andcc		%o1, 0x7, %g1  	bne,pt		%icc, 8f  	 sll		%g1, 3, %g1 @@ -468,16 +469,16 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	 nop  	ba,a,pt		%xcc, 73b -8:	mov		64, %o3 +8:	mov		64, GLOBAL_SPARE  	andn		%o1, 0x7, %o1  	EX_LD(LOAD(ldx, %o1, %g2)) -	sub		%o3, %g1, %o3 +	sub		GLOBAL_SPARE, %g1, GLOBAL_SPARE  	andn		%o2, 0x7, %o4  	sllx		%g2, %g1, %g2  1:	add		%o1, 0x8, %o1  	EX_LD(LOAD(ldx, %o1, %g3))  	subcc		%o4, 0x8, %o4 -	srlx		%g3, %o3, %o5 +	srlx		%g3, GLOBAL_SPARE, %o5  	or		%o5, %g2, %o5  	EX_ST(STORE(stx, %o5, %o0))  	add		%o0, 0x8, %o0 @@ -489,32 +490,32 @@ FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */  	be,pn		%icc, 85f  	 add		%o1, %g1, %o1  	ba,pt		%xcc, 90f -	 sub		%o0, %o1, %o3 +	 sub		%o0, %o1, GLOBAL_SPARE  	.align		64  80: /* 0 < len <= 16 */ -	andcc		%o3, 0x3, %g0 +	andcc		GLOBAL_SPARE, 0x3, %g0  	bne,pn		%XCC, 90f -	 sub		%o0, %o1, %o3 +	 sub		%o0, %o1, GLOBAL_SPARE  1:  	subcc		%o2, 4, %o2  	EX_LD(LOAD(lduw, %o1, %g1)) -	EX_ST(STORE(stw, %g1, %o1 + %o3)) +	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE))  	bgu,pt		%XCC, 1b  	 add		%o1, 4, %o1  85:	retl -	 mov		EX_RETVAL(GLOBAL_SPARE), %o0 +	 mov		EX_RETVAL(%o3), %o0  	.align		32  90:  	subcc		%o2, 1, %o2  	EX_LD(LOAD(ldub, %o1, %g1)) -	EX_ST(STORE(stb, %g1, %o1 + %o3)) +	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE))  	bgu,pt		%XCC, 90b  	 add		%o1, 1, %o1  	retl -	 mov		EX_RETVAL(GLOBAL_SPARE), %o0 +	 mov		EX_RETVAL(%o3), %o0  	.size		FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG2page.S b/arch/sparc/lib/NG2page.S deleted file mode 100644 index 73b6b7c72cb..00000000000 --- a/arch/sparc/lib/NG2page.S +++ /dev/null @@ -1,61 +0,0 @@ -/* NG2page.S: Niagara-2 optimized clear and copy page. - * - * Copyright (C) 2007 (davem@davemloft.net) - */ - -#include <asm/asi.h> -#include <asm/page.h> -#include <asm/visasm.h> - -	.text -	.align	32 - -	/* This is heavily simplified from the sun4u variants -	 * because Niagara-2 does not have any D-cache aliasing issues. 
-	 */ -NG2copy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */ -	prefetch	[%o1 + 0x00], #one_read -	prefetch	[%o1 + 0x40], #one_read -	VISEntryHalf -	set		PAGE_SIZE, %g7 -	sub		%o0, %o1, %g3 -1:	stxa		%g0, [%o1 + %g3] ASI_BLK_INIT_QUAD_LDD_P -	subcc		%g7, 64, %g7 -	ldda		[%o1] ASI_BLK_P, %f0 -	stda		%f0, [%o1 + %g3] ASI_BLK_P -	add		%o1, 64, %o1 -	bne,pt		%xcc, 1b -	 prefetch	[%o1 + 0x40], #one_read -	membar		#Sync -	VISExitHalf -	retl -	 nop - -#define BRANCH_ALWAYS	0x10680000 -#define NOP		0x01000000 -#define NG_DO_PATCH(OLD, NEW)	\ -	sethi	%hi(NEW), %g1; \ -	or	%g1, %lo(NEW), %g1; \ -	sethi	%hi(OLD), %g2; \ -	or	%g2, %lo(OLD), %g2; \ -	sub	%g1, %g2, %g1; \ -	sethi	%hi(BRANCH_ALWAYS), %g3; \ -	sll	%g1, 11, %g1; \ -	srl	%g1, 11 + 2, %g1; \ -	or	%g3, %lo(BRANCH_ALWAYS), %g3; \ -	or	%g3, %g1, %g3; \ -	stw	%g3, [%g2]; \ -	sethi	%hi(NOP), %g3; \ -	or	%g3, %lo(NOP), %g3; \ -	stw	%g3, [%g2 + 0x4]; \ -	flush	%g2; - -	.globl	niagara2_patch_pageops -	.type	niagara2_patch_pageops,#function -niagara2_patch_pageops: -	NG_DO_PATCH(copy_user_page, NG2copy_user_page) -	NG_DO_PATCH(_clear_page, NGclear_page) -	NG_DO_PATCH(clear_user_page, NGclear_user_page) -	retl -	 nop -	.size	niagara2_patch_pageops,.-niagara2_patch_pageops diff --git a/arch/sparc/lib/NG4clear_page.S b/arch/sparc/lib/NG4clear_page.S new file mode 100644 index 00000000000..e16c88204a4 --- /dev/null +++ b/arch/sparc/lib/NG4clear_page.S @@ -0,0 +1,29 @@ +/* NG4copy_page.S: Niagara-4 optimized clear page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + +	.text + +	.register	%g3, #scratch + +	.align		32 +	.globl		NG4clear_page +	.globl		NG4clear_user_page +NG4clear_page:		/* %o0=dest */ +NG4clear_user_page:	/* %o0=dest, %o1=vaddr */ +	set		PAGE_SIZE, %g7 +	mov		0x20, %g3 +1:	stxa		%g0, [%o0 + %g0] ASI_ST_BLKINIT_MRU_P +	subcc		%g7, 0x40, %g7 +	stxa		%g0, [%o0 + %g3] ASI_ST_BLKINIT_MRU_P +	bne,pt		%xcc, 1b +	 add		%o0, 0x40, %o0 +	membar		#StoreLoad|#StoreStore +	retl +	 nop +	.size		NG4clear_page,.-NG4clear_page +	.size		NG4clear_user_page,.-NG4clear_user_page
\ No newline at end of file diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S new file mode 100644 index 00000000000..fd9f903ffa3 --- /dev/null +++ b/arch/sparc/lib/NG4copy_from_user.S @@ -0,0 +1,30 @@ +/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x)		\ +98:	x;			\ +	.section __ex_table,"a";\ +	.align 4;		\ +	.word 98b, __retl_one_asi;\ +	.text;			\ +	.align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS	0x11 +#endif + +#define FUNC_NAME		NG4copy_from_user +#define LOAD(type,addr,dest)	type##a [addr] %asi, dest +#define EX_RETVAL(x)		0 + +#ifdef __KERNEL__ +#define PREAMBLE					\ +	rd		%asi, %g1;			\ +	cmp		%g1, ASI_AIUS;			\ +	bne,pn		%icc, ___copy_in_user;		\ +	 nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S new file mode 100644 index 00000000000..28504e88c53 --- /dev/null +++ b/arch/sparc/lib/NG4copy_page.S @@ -0,0 +1,57 @@ +/* NG4copy_page.S: Niagara-4 optimized copy page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + +	.text +	.align		32 + +	.register	%g2, #scratch +	.register	%g3, #scratch + +	.globl		NG4copy_user_page +NG4copy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */ +	prefetch	[%o1 + 0x000], #n_reads_strong +	prefetch	[%o1 + 0x040], #n_reads_strong +	prefetch	[%o1 + 0x080], #n_reads_strong +	prefetch	[%o1 + 0x0c0], #n_reads_strong +	set		PAGE_SIZE, %g7 +	prefetch	[%o1 + 0x100], #n_reads_strong +	prefetch	[%o1 + 0x140], #n_reads_strong +	prefetch	[%o1 + 0x180], #n_reads_strong +	prefetch	[%o1 + 0x1c0], #n_reads_strong +1: +	ldx		[%o1 + 0x00], %o2 +	subcc		%g7, 0x40, %g7 +	ldx		[%o1 + 0x08], %o3 +	ldx		[%o1 + 0x10], %o4 +	ldx		[%o1 + 0x18], %o5 +	ldx		[%o1 + 0x20], %g1 +	stxa		%o2, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	ldx		[%o1 + 0x28], %g2 +	stxa		%o3, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	ldx		[%o1 + 0x30], %g3 +	stxa		%o4, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	ldx		[%o1 + 0x38], %o2 +	add		%o1, 0x40, %o1 +	stxa		%o5, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	stxa		%g1, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	stxa		%g2, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	stxa		%g3, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	stxa		%o2, [%o0] ASI_ST_BLKINIT_MRU_P +	add		%o0, 0x08, %o0 +	bne,pt		%icc, 1b +	 prefetch	[%o1 + 0x200], #n_reads_strong +	retl +	 membar		#StoreLoad | #StoreStore +	.size		NG4copy_user_page,.-NG4copy_user_page diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S new file mode 100644 index 00000000000..9744c4540a8 --- /dev/null +++ b/arch/sparc/lib/NG4copy_to_user.S @@ -0,0 +1,39 @@ +/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x)		\ +98:	x;			\ +	.section __ex_table,"a";\ +	.align 4;		\ +	.word 98b, __retl_one_asi;\ +	.text;			\ +	.align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS	0x11 +#endif + +#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS +#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 +#endif + +#define FUNC_NAME		NG4copy_to_user +#define STORE(type,src,addr)	type##a src, [addr] %asi +#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS +#define EX_RETVAL(x)		0 + +#ifdef __KERNEL__ +	/* Writing to %asi is _expensive_ so we hardcode it. +	 * Reading %asi to check for KERNEL_DS is comparatively +	 * cheap. 
+	 */ +#define PREAMBLE					\ +	rd		%asi, %g1;			\ +	cmp		%g1, ASI_AIUS;			\ +	bne,pn		%icc, ___copy_in_user;		\ +	 nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S new file mode 100644 index 00000000000..9cf2ee01cee --- /dev/null +++ b/arch/sparc/lib/NG4memcpy.S @@ -0,0 +1,360 @@ +/* NG4memcpy.S: Niagara-4 optimized memcpy. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE	%g7 +#else +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define FPRS_FEF  0x04 + +/* On T4 it is very expensive to access ASRs like %fprs and + * %asi, avoiding a read or a write can save ~50 cycles. + */ +#define FPU_ENTER			\ +	rd	%fprs, %o5;		\ +	andcc	%o5, FPRS_FEF, %g0;	\ +	be,a,pn	%icc, 999f;		\ +	 wr	%g0, FPRS_FEF, %fprs;	\ +	999: + +#ifdef MEMCPY_DEBUG +#define VISEntryHalf FPU_ENTER; \ +		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntryHalf FPU_ENTER +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif + +#define GLOBAL_SPARE	%g5 +#endif + +#ifndef STORE_ASI +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA +#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P +#else +#define STORE_ASI	0x80		/* ASI_P */ +#endif +#endif + +#ifndef EX_LD +#define EX_LD(x)	x +#endif + +#ifndef EX_ST +#define EX_ST(x)	x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x)	x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest)	type [addr], dest +#endif + +#ifndef STORE +#ifndef MEMCPY_DEBUG +#define STORE(type,src,addr)	type src, [addr] +#else +#define STORE(type,src,addr)	type##a src, [addr] %asi +#endif +#endif + +#ifndef STORE_INIT +#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME	NG4memcpy +#endif +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + +	.register	%g2,#scratch +	.register	%g3,#scratch + +	.text +	.align		64 + +	.globl	FUNC_NAME +	.type	FUNC_NAME,#function +FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */ +#ifdef MEMCPY_DEBUG +	wr		%g0, 0x80, %asi +#endif +	srlx		%o2, 31, %g2 +	cmp		%g2, 0 +	tne		%XCC, 5 +	PREAMBLE +	mov		%o0, %o3 +	brz,pn		%o2, .Lexit +	 cmp		%o2, 3 +	ble,pn		%icc, .Ltiny +	 cmp		%o2, 19 +	ble,pn		%icc, .Lsmall +	 or		%o0, %o1, %g2 +	cmp		%o2, 128 +	bl,pn		%icc, .Lmedium +	 nop + +.Llarge:/* len >= 0x80 */ +	/* First get dest 8 byte aligned.  */ +	sub		%g0, %o0, %g1 +	and		%g1, 0x7, %g1 +	brz,pt		%g1, 51f +	 sub		%o2, %g1, %o2 + +1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) +	add		%o1, 1, %o1 +	subcc		%g1, 1, %g1 +	add		%o0, 1, %o0 +	bne,pt		%icc, 1b +	 EX_ST(STORE(stb, %g2, %o0 - 0x01)) + +51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x080, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x100, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x140, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x180, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) +	LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + +	/* Check if we can use the straight fully aligned +	 * loop, or we require the alignaddr/faligndata variant. +	 */ +	andcc		%o1, 0x7, %o5 +	bne,pn		%icc, .Llarge_src_unaligned +	 sub		%g0, %o0, %g1 + +	/* Legitimize the use of initializing stores by getting dest +	 * to be 64-byte aligned. 
+	 */ +	and		%g1, 0x3f, %g1 +	brz,pt		%g1, .Llarge_aligned +	 sub		%o2, %g1, %o2 + +1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) +	add		%o1, 8, %o1 +	subcc		%g1, 8, %g1 +	add		%o0, 8, %o0 +	bne,pt		%icc, 1b +	 EX_ST(STORE(stx, %g2, %o0 - 0x08)) + +.Llarge_aligned: +	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ +	andn		%o2, 0x3f, %o4 +	sub		%o2, %o4, %o2 + +1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) +	add		%o1, 0x40, %o1 +	EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) +	subcc		%o4, 0x40, %o4 +	EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) +	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) +	EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) +	EX_ST(STORE_INIT(%g1, %o0)) +	add		%o0, 0x08, %o0 +	EX_ST(STORE_INIT(%g2, %o0)) +	add		%o0, 0x08, %o0 +	EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) +	EX_ST(STORE_INIT(%g3, %o0)) +	add		%o0, 0x08, %o0 +	EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) +	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) +	add		%o0, 0x08, %o0 +	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) +	EX_ST(STORE_INIT(%o5, %o0)) +	add		%o0, 0x08, %o0 +	EX_ST(STORE_INIT(%g2, %o0)) +	add		%o0, 0x08, %o0 +	EX_ST(STORE_INIT(%g3, %o0)) +	add		%o0, 0x08, %o0 +	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) +	add		%o0, 0x08, %o0 +	bne,pt		%icc, 1b +	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + +	membar		#StoreLoad | #StoreStore + +	brz,pn		%o2, .Lexit +	 cmp		%o2, 19 +	ble,pn		%icc, .Lsmall_unaligned +	 nop +	ba,a,pt		%icc, .Lmedium_noprefetch + +.Lexit:	retl +	 mov		EX_RETVAL(%o3), %o0 + +.Llarge_src_unaligned: +	andn		%o2, 0x3f, %o4 +	sub		%o2, %o4, %o2 +	VISEntryHalf +	alignaddr	%o1, %g0, %g1 +	add		%o1, %o4, %o1 +	EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) +1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) +	subcc		%o4, 0x40, %o4 +	EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) +	EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) +	EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) +	EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) +	EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) +	EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) +	faligndata	%f0, %f2, %f16 +	EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) +	faligndata	%f2, %f4, %f18 +	add		%g1, 0x40, %g1 +	faligndata	%f4, %f6, %f20 +	faligndata	%f6, %f8, %f22 +	faligndata	%f8, %f10, %f24 +	faligndata	%f10, %f12, %f26 +	faligndata	%f12, %f14, %f28 +	faligndata	%f14, %f0, %f30 +	EX_ST(STORE(std, %f16, %o0 + 0x00)) +	EX_ST(STORE(std, %f18, %o0 + 0x08)) +	EX_ST(STORE(std, %f20, %o0 + 0x10)) +	EX_ST(STORE(std, %f22, %o0 + 0x18)) +	EX_ST(STORE(std, %f24, %o0 + 0x20)) +	EX_ST(STORE(std, %f26, %o0 + 0x28)) +	EX_ST(STORE(std, %f28, %o0 + 0x30)) +	EX_ST(STORE(std, %f30, %o0 + 0x38)) +	add		%o0, 0x40, %o0 +	bne,pt		%icc, 1b +	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong) +	VISExitHalf + +	brz,pn		%o2, .Lexit +	 cmp		%o2, 19 +	ble,pn		%icc, .Lsmall_unaligned +	 nop +	ba,a,pt		%icc, .Lmedium_unaligned + +.Lmedium: +	LOAD(prefetch, %o1 + 0x40, #n_reads_strong) +	andcc		%g2, 0x7, %g0 +	bne,pn		%icc, .Lmedium_unaligned +	 nop +.Lmedium_noprefetch: +	andncc		%o2, 0x20 - 1, %o5 +	be,pn		%icc, 2f +	 sub		%o2, %o5, %o2 +1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) +	EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) +	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) +	EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) +	add		%o1, 0x20, %o1 +	subcc		%o5, 0x20, %o5 +	EX_ST(STORE(stx, %g1, %o0 + 0x00)) +	EX_ST(STORE(stx, %g2, %o0 + 0x08)) +	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) +	EX_ST(STORE(stx, %o4, %o0 + 0x18)) +	bne,pt		%icc, 1b +	 add		%o0, 0x20, %o0 +2:	andcc		%o2, 0x18, %o5 +	be,pt		%icc, 3f +	 sub		%o2, %o5, %o2 +1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) +	add		%o1, 0x08, %o1 +	add		%o0, 0x08, %o0 +	subcc		%o5, 0x08, %o5 +	bne,pt		%icc, 1b +	 EX_ST(STORE(stx, %g1, %o0 - 0x08)) +3:	brz,pt		%o2, .Lexit +	 
cmp		%o2, 0x04 +	bl,pn		%icc, .Ltiny +	 nop +	EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) +	add		%o1, 0x04, %o1 +	add		%o0, 0x04, %o0 +	subcc		%o2, 0x04, %o2 +	bne,pn		%icc, .Ltiny +	 EX_ST(STORE(stw, %g1, %o0 - 0x04)) +	ba,a,pt		%icc, .Lexit +.Lmedium_unaligned: +	/* First get dest 8 byte aligned.  */ +	sub		%g0, %o0, %g1 +	and		%g1, 0x7, %g1 +	brz,pt		%g1, 2f +	 sub		%o2, %g1, %o2 + +1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) +	add		%o1, 1, %o1 +	subcc		%g1, 1, %g1 +	add		%o0, 1, %o0 +	bne,pt		%icc, 1b +	 EX_ST(STORE(stb, %g2, %o0 - 0x01)) +2: +	and		%o1, 0x7, %g1 +	brz,pn		%g1, .Lmedium_noprefetch +	 sll		%g1, 3, %g1 +	mov		64, %g2 +	sub		%g2, %g1, %g2 +	andn		%o1, 0x7, %o1 +	EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) +	sllx		%o4, %g1, %o4 +	andn		%o2, 0x08 - 1, %o5 +	sub		%o2, %o5, %o2 +1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) +	add		%o1, 0x08, %o1 +	subcc		%o5, 0x08, %o5 +	srlx		%g3, %g2, GLOBAL_SPARE +	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE +	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) +	add		%o0, 0x08, %o0 +	bne,pt		%icc, 1b +	 sllx		%g3, %g1, %o4 +	srl		%g1, 3, %g1 +	add		%o1, %g1, %o1 +	brz,pn		%o2, .Lexit +	 nop +	ba,pt		%icc, .Lsmall_unaligned + +.Ltiny: +	EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) +	subcc		%o2, 1, %o2 +	be,pn		%icc, .Lexit +	 EX_ST(STORE(stb, %g1, %o0 + 0x00)) +	EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) +	subcc		%o2, 1, %o2 +	be,pn		%icc, .Lexit +	 EX_ST(STORE(stb, %g1, %o0 + 0x01)) +	EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) +	ba,pt		%icc, .Lexit +	 EX_ST(STORE(stb, %g1, %o0 + 0x02)) + +.Lsmall: +	andcc		%g2, 0x3, %g0 +	bne,pn		%icc, .Lsmall_unaligned +	 andn		%o2, 0x4 - 1, %o5 +	sub		%o2, %o5, %o2 +1: +	EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) +	add		%o1, 0x04, %o1 +	subcc		%o5, 0x04, %o5 +	add		%o0, 0x04, %o0 +	bne,pt		%icc, 1b +	 EX_ST(STORE(stw, %g1, %o0 - 0x04)) +	brz,pt		%o2, .Lexit +	 nop +	ba,a,pt		%icc, .Ltiny + +.Lsmall_unaligned: +1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) +	add		%o1, 1, %o1 +	add		%o0, 1, %o0 +	subcc		%o2, 1, %o2 +	bne,pt		%icc, 1b +	 EX_ST(STORE(stb, %g1, %o0 - 0x01)) +	ba,a,pt		%icc, .Lexit +	.size		FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S new file mode 100644 index 00000000000..41da4bdd95c --- /dev/null +++ b/arch/sparc/lib/NG4memset.S @@ -0,0 +1,105 @@ +/* NG4memset.S: Niagara-4 optimized memset/bzero. + * + * Copyright (C) 2012 David S. 
Miller (davem@davemloft.net) + */ + +#include <asm/asi.h> + +	.register	%g2, #scratch +	.register	%g3, #scratch + +	.text +	.align		32 +	.globl		NG4memset +NG4memset: +	andcc		%o1, 0xff, %o4 +	be,pt		%icc, 1f +	 mov		%o2, %o1 +	sllx		%o4, 8, %g1 +	or		%g1, %o4, %o2 +	sllx		%o2, 16, %g1 +	or		%g1, %o2, %o2 +	sllx		%o2, 32, %g1 +	ba,pt		%icc, 1f +	 or		%g1, %o2, %o4 +	.size		NG4memset,.-NG4memset + +	.align		32 +	.globl		NG4bzero +NG4bzero: +	clr		%o4 +1:	cmp		%o1, 16 +	ble		%icc, .Ltiny +	 mov		%o0, %o3 +	sub		%g0, %o0, %g1 +	and		%g1, 0x7, %g1 +	brz,pt		%g1, .Laligned8 +	 sub		%o1, %g1, %o1 +1:	stb		%o4, [%o0 + 0x00] +	subcc		%g1, 1, %g1 +	bne,pt		%icc, 1b +	 add		%o0, 1, %o0 +.Laligned8: +	cmp		%o1, 64 + (64 - 8) +	ble		.Lmedium +	 sub		%g0, %o0, %g1 +	andcc		%g1, (64 - 1), %g1 +	brz,pn		%g1, .Laligned64 +	 sub		%o1, %g1, %o1 +1:	stx		%o4, [%o0 + 0x00] +	subcc		%g1, 8, %g1 +	bne,pt		%icc, 1b +	 add		%o0, 0x8, %o0 +.Laligned64: +	andn		%o1, 64 - 1, %g1 +	sub		%o1, %g1, %o1 +	brnz,pn		%o4, .Lnon_bzero_loop +	 mov		0x20, %g2 +1:	stxa		%o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P +	subcc		%g1, 0x40, %g1 +	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P +	bne,pt		%icc, 1b +	 add		%o0, 0x40, %o0 +.Lpostloop: +	cmp		%o1, 8 +	bl,pn		%icc, .Ltiny +	 membar		#StoreStore|#StoreLoad +.Lmedium: +	andn		%o1, 0x7, %g1 +	sub		%o1, %g1, %o1 +1:	stx		%o4, [%o0 + 0x00] +	subcc		%g1, 0x8, %g1 +	bne,pt		%icc, 1b +	 add		%o0, 0x08, %o0 +	andcc		%o1, 0x4, %g1 +	be,pt		%icc, .Ltiny +	 sub		%o1, %g1, %o1 +	stw		%o4, [%o0 + 0x00] +	add		%o0, 0x4, %o0 +.Ltiny: +	cmp		%o1, 0 +	be,pn		%icc, .Lexit +1:	 subcc		%o1, 1, %o1 +	stb		%o4, [%o0 + 0x00] +	bne,pt		%icc, 1b +	 add		%o0, 1, %o0 +.Lexit: +	retl +	 mov		%o3, %o0 +.Lnon_bzero_loop: +	mov		0x08, %g3 +	mov		0x28, %o5 +1:	stxa		%o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P +	subcc		%g1, 0x40, %g1 +	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P +	stxa		%o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P +	stxa		%o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P +	add		%o0, 0x10, %o0 +	stxa		%o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P +	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P +	stxa		%o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P +	stxa		%o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P +	bne,pt		%icc, 1b +	 add		%o0, 0x30, %o0 +	ba,a,pt		%icc, .Lpostloop +	.size		NG4bzero,.-NG4bzero diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S new file mode 100644 index 00000000000..a114cbcf2a4 --- /dev/null +++ b/arch/sparc/lib/NG4patch.S @@ -0,0 +1,54 @@ +/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant. + * + * Copyright (C) 2012 David S. 
Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS	0x10680000 +#define NOP		0x01000000 +#define NG_DO_PATCH(OLD, NEW)	\ +	sethi	%hi(NEW), %g1; \ +	or	%g1, %lo(NEW), %g1; \ +	sethi	%hi(OLD), %g2; \ +	or	%g2, %lo(OLD), %g2; \ +	sub	%g1, %g2, %g1; \ +	sethi	%hi(BRANCH_ALWAYS), %g3; \ +	sll	%g1, 11, %g1; \ +	srl	%g1, 11 + 2, %g1; \ +	or	%g3, %lo(BRANCH_ALWAYS), %g3; \ +	or	%g3, %g1, %g3; \ +	stw	%g3, [%g2]; \ +	sethi	%hi(NOP), %g3; \ +	or	%g3, %lo(NOP), %g3; \ +	stw	%g3, [%g2 + 0x4]; \ +	flush	%g2; + +	.globl	niagara4_patch_copyops +	.type	niagara4_patch_copyops,#function +niagara4_patch_copyops: +	NG_DO_PATCH(memcpy, NG4memcpy) +	NG_DO_PATCH(___copy_from_user, NG4copy_from_user) +	NG_DO_PATCH(___copy_to_user, NG4copy_to_user) +	retl +	 nop +	.size	niagara4_patch_copyops,.-niagara4_patch_copyops + +	.globl	niagara4_patch_bzero +	.type	niagara4_patch_bzero,#function +niagara4_patch_bzero: +	NG_DO_PATCH(memset, NG4memset) +	NG_DO_PATCH(__bzero, NG4bzero) +	NG_DO_PATCH(__clear_user, NGclear_user) +	NG_DO_PATCH(tsb_init, NGtsb_init) +	retl +	 nop +	.size	niagara4_patch_bzero,.-niagara4_patch_bzero + +	.globl	niagara4_patch_pageops +	.type	niagara4_patch_pageops,#function +niagara4_patch_pageops: +	NG_DO_PATCH(copy_user_page, NG4copy_user_page) +	NG_DO_PATCH(_clear_page, NG4clear_page) +	NG_DO_PATCH(clear_user_page, NG4clear_user_page) +	retl +	 nop +	.size	niagara4_patch_pageops,.-niagara4_patch_pageops diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S index 428920de05b..423d46e2258 100644 --- a/arch/sparc/lib/NGpage.S +++ b/arch/sparc/lib/NGpage.S @@ -16,55 +16,93 @@  	 */  NGcopy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */ -	prefetch	[%o1 + 0x00], #one_read -	mov		8, %g1 -	mov		16, %g2 -	mov		24, %g3 +	save		%sp, -192, %sp +	rd		%asi, %g3 +	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi  	set		PAGE_SIZE, %g7 +	prefetch	[%i1 + 0x00], #one_read +	prefetch	[%i1 + 0x40], #one_read -1:	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2 -	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4 -	prefetch	[%o1 + 0x40], #one_read -	add		%o1, 32, %o1 -	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P -	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2 -	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P -	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4 -	add		%o1, 32, %o1 -	add		%o0, 32, %o0 -	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P -	subcc		%g7, 64, %g7 +1:	prefetch	[%i1 + 0x80], #one_read +	prefetch	[%i1 + 0xc0], #one_read +	ldda		[%i1 + 0x00] %asi, %o2 +	ldda		[%i1 + 0x10] %asi, %o4 +	ldda		[%i1 + 0x20] %asi, %l2 +	ldda		[%i1 + 0x30] %asi, %l4 +	stxa		%o2, [%i0 + 0x00] %asi +	stxa		%o3, [%i0 + 0x08] %asi +	stxa		%o4, [%i0 + 0x10] %asi +	stxa		%o5, [%i0 + 0x18] %asi +	stxa		%l2, [%i0 + 0x20] %asi +	stxa		%l3, [%i0 + 0x28] %asi +	stxa		%l4, [%i0 + 0x30] %asi +	stxa		%l5, [%i0 + 0x38] %asi +	ldda		[%i1 + 0x40] %asi, %o2 +	ldda		[%i1 + 0x50] %asi, %o4 +	ldda		[%i1 + 0x60] %asi, %l2 +	ldda		[%i1 + 0x70] %asi, %l4 +	stxa		%o2, [%i0 + 0x40] %asi +	stxa		%o3, [%i0 + 0x48] %asi +	stxa		%o4, [%i0 + 0x50] %asi +	stxa		%o5, [%i0 + 0x58] %asi +	stxa		%l2, [%i0 + 0x60] %asi +	stxa		%l3, [%i0 + 0x68] %asi +	stxa		%l4, [%i0 + 0x70] %asi +	stxa		%l5, [%i0 + 0x78] %asi +	add		%i1, 128, %i1 +	subcc		%g7, 128, %g7  	bne,pt		%xcc, 1b -	 add		%o0, 32, %o0 +	 add		%i0, 128, %i0 +	wr		
%g3, 0x0, %asi  	membar		#Sync -	retl -	 nop +	ret +	 restore -	.globl		NGclear_page, NGclear_user_page +	.align		32 +	.globl		NGclear_page +	.globl		NGclear_user_page  NGclear_page:		/* %o0=dest */  NGclear_user_page:	/* %o0=dest, %o1=vaddr */ -	mov		8, %g1 -	mov		16, %g2 -	mov		24, %g3 +	rd		%asi, %g3 +	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi  	set		PAGE_SIZE, %g7 -1:	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P -	add		%o0, 32, %o0 -	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P -	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P -	subcc		%g7, 64, %g7 +1:	stxa		%g0, [%o0 + 0x00] %asi +	stxa		%g0, [%o0 + 0x08] %asi +	stxa		%g0, [%o0 + 0x10] %asi +	stxa		%g0, [%o0 + 0x18] %asi +	stxa		%g0, [%o0 + 0x20] %asi +	stxa		%g0, [%o0 + 0x28] %asi +	stxa		%g0, [%o0 + 0x30] %asi +	stxa		%g0, [%o0 + 0x38] %asi +	stxa		%g0, [%o0 + 0x40] %asi +	stxa		%g0, [%o0 + 0x48] %asi +	stxa		%g0, [%o0 + 0x50] %asi +	stxa		%g0, [%o0 + 0x58] %asi +	stxa		%g0, [%o0 + 0x60] %asi +	stxa		%g0, [%o0 + 0x68] %asi +	stxa		%g0, [%o0 + 0x70] %asi +	stxa		%g0, [%o0 + 0x78] %asi +	stxa		%g0, [%o0 + 0x80] %asi +	stxa		%g0, [%o0 + 0x88] %asi +	stxa		%g0, [%o0 + 0x90] %asi +	stxa		%g0, [%o0 + 0x98] %asi +	stxa		%g0, [%o0 + 0xa0] %asi +	stxa		%g0, [%o0 + 0xa8] %asi +	stxa		%g0, [%o0 + 0xb0] %asi +	stxa		%g0, [%o0 + 0xb8] %asi +	stxa		%g0, [%o0 + 0xc0] %asi +	stxa		%g0, [%o0 + 0xc8] %asi +	stxa		%g0, [%o0 + 0xd0] %asi +	stxa		%g0, [%o0 + 0xd8] %asi +	stxa		%g0, [%o0 + 0xe0] %asi +	stxa		%g0, [%o0 + 0xe8] %asi +	stxa		%g0, [%o0 + 0xf0] %asi +	stxa		%g0, [%o0 + 0xf8] %asi +	subcc		%g7, 256, %g7  	bne,pt		%xcc, 1b -	 add		%o0, 32, %o0 +	 add		%o0, 256, %o0 +	wr		%g3, 0x0, %asi  	membar		#Sync  	retl  	 nop diff --git a/arch/sparc/lib/U1memcpy.S b/arch/sparc/lib/U1memcpy.S index bafd2fc07ac..b67142b7768 100644 --- a/arch/sparc/lib/U1memcpy.S +++ b/arch/sparc/lib/U1memcpy.S @@ -109,7 +109,7 @@  #define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\  	subcc			%left, 8, %left;	\  	bl,pn			%xcc, 95f;		\ -	 fsrc1			%f0, %f1; +	 fsrc2			%f0, %f1;  #define UNEVEN_VISCHUNK(dest, f0, f1, left)		\  	UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\ @@ -201,7 +201,7 @@ FUNC_NAME:		/* %o0=dst, %o1=src, %o2=len */  	andn		%o1, (0x40 - 1), %o1  	and		%g2, 7, %g2  	andncc		%g3, 0x7, %g3 -	fmovd		%f0, %f2 +	fsrc2		%f0, %f2  	sub		%g3, 0x8, %g3  	sub		%o2, %GLOBAL_SPARE, %o2 diff --git a/arch/sparc/lib/ashldi3.S b/arch/sparc/lib/ashldi3.S index 17912e60871..86f60de07b0 100644 --- a/arch/sparc/lib/ashldi3.S +++ b/arch/sparc/lib/ashldi3.S @@ -5,10 +5,10 @@   * Copyright (C) 1999 David S. Miller (davem@redhat.com)   */ +#include <linux/linkage.h> +  	.text -	.align	4 -	.globl	__ashldi3 -__ashldi3: +ENTRY(__ashldi3)  	cmp	%o2, 0  	be	9f  	 mov	0x20, %g2 @@ -32,3 +32,4 @@ __ashldi3:  9:  	retl  	 nop +ENDPROC(__ashldi3) diff --git a/arch/sparc/lib/ashrdi3.S b/arch/sparc/lib/ashrdi3.S index 85398fd6dcc..6eb8ba2dd50 100644 --- a/arch/sparc/lib/ashrdi3.S +++ b/arch/sparc/lib/ashrdi3.S @@ -5,10 +5,10 @@   * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu)   */ +#include <linux/linkage.h> +  	.text -	.align	4 -	.globl __ashrdi3 -__ashrdi3: +ENTRY(__ashrdi3)  	tst	%o2  	be	3f  	 or	%g0, 32, %g2 @@ -34,3 +34,4 @@ __ashrdi3:  3:  	jmpl	%o7 + 8, %g0  	 nop +ENDPROC(__ashrdi3) diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index cbddeb38ffd..1d32b54089a 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -7,7 +7,7 @@   * Based on asm-parisc/atomic.h Copyright (C) 2000 Philipp Rumpf   */ -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <linux/spinlock.h>  #include <linux/module.h> @@ -16,7 +16,7 @@  #define ATOMIC_HASH(a)	(&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)])  spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = { -	[0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED +	[0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash)  };  #else /* SMP */ @@ -55,7 +55,7 @@ int atomic_cmpxchg(atomic_t *v, int old, int new)  }  EXPORT_SYMBOL(atomic_cmpxchg); -int atomic_add_unless(atomic_t *v, int a, int u) +int __atomic_add_unless(atomic_t *v, int a, int u)  {  	int ret;  	unsigned long flags; @@ -65,9 +65,9 @@ int atomic_add_unless(atomic_t *v, int a, int u)  	if (ret != u)  		v->counter += a;  	spin_unlock_irqrestore(ATOMIC_HASH(v), flags); -	return ret != u; +	return ret;  } -EXPORT_SYMBOL(atomic_add_unless); +EXPORT_SYMBOL(__atomic_add_unless);  /* Atomic operations are already serializing */  void atomic_set(atomic_t *v, int i) diff --git a/arch/sparc/lib/atomic_32.S b/arch/sparc/lib/atomic_32.S deleted file mode 100644 index 178cbb8ae1b..00000000000 --- a/arch/sparc/lib/atomic_32.S +++ /dev/null @@ -1,99 +0,0 @@ -/* atomic.S: Move this stuff here for better ICACHE hit rates. - * - * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu) - */ - -#include <asm/ptrace.h> -#include <asm/psr.h> - -	.text -	.align	4 - -	.globl  __atomic_begin -__atomic_begin: - -#ifndef CONFIG_SMP -	.globl	___xchg32_sun4c -___xchg32_sun4c: -	rd	%psr, %g3 -	andcc	%g3, PSR_PIL, %g0 -	bne	1f -	 nop -	wr	%g3, PSR_PIL, %psr -	nop; nop; nop -1: -	andcc	%g3, PSR_PIL, %g0 -	ld	[%g1], %g7 -	bne	1f -	 st	%g2, [%g1] -	wr	%g3, 0x0, %psr -	nop; nop; nop -1: -	mov	%g7, %g2 -	jmpl	%o7 + 8, %g0 -	 mov	%g4, %o7 - -	.globl	___xchg32_sun4md -___xchg32_sun4md: -	swap	[%g1], %g2 -	jmpl	%o7 + 8, %g0 -	 mov	%g4, %o7 -#endif - -	/* Read asm-sparc/atomic.h carefully to understand how this works for SMP. -	 * Really, some things here for SMP are overly clever, go read the header. -	 */ -	.globl	___atomic24_add -___atomic24_add: -	rd	%psr, %g3		! Keep the code small, old way was stupid -	nop; nop; nop;			! Let the bits set -	or	%g3, PSR_PIL, %g7	! Disable interrupts -	wr	%g7, 0x0, %psr		! Set %psr -	nop; nop; nop;			! Let the bits set -#ifdef CONFIG_SMP -1:	ldstub	[%g1 + 3], %g7		! Spin on the byte lock for SMP. -	orcc	%g7, 0x0, %g0		! Did we get it? -	bne	1b			! Nope... -	 ld	[%g1], %g7		! Load locked atomic24_t -	sra	%g7, 8, %g7		! Get signed 24-bit integer -	add	%g7, %g2, %g2		! Add in argument -	sll	%g2, 8, %g7		! Transpose back to atomic24_t -	st	%g7, [%g1]		! Clever: This releases the lock as well. -#else -	ld	[%g1], %g7		! Load locked atomic24_t -	add	%g7, %g2, %g2		! Add in argument -	st	%g2, [%g1]		! Store it back -#endif -	wr	%g3, 0x0, %psr		! Restore original PSR_PIL -	nop; nop; nop;			! Let the bits set -	jmpl	%o7, %g0		! NOTE: not + 8, see callers in atomic.h -	 mov	%g4, %o7		! Restore %o7 - -	.globl	___atomic24_sub -___atomic24_sub: -	rd	%psr, %g3		! 
Keep the code small, old way was stupid -	nop; nop; nop;			! Let the bits set -	or	%g3, PSR_PIL, %g7	! Disable interrupts -	wr	%g7, 0x0, %psr		! Set %psr -	nop; nop; nop;			! Let the bits set -#ifdef CONFIG_SMP -1:	ldstub	[%g1 + 3], %g7		! Spin on the byte lock for SMP. -	orcc	%g7, 0x0, %g0		! Did we get it? -	bne	1b			! Nope... -	 ld	[%g1], %g7		! Load locked atomic24_t -	sra	%g7, 8, %g7		! Get signed 24-bit integer -	sub	%g7, %g2, %g2		! Subtract argument -	sll	%g2, 8, %g7		! Transpose back to atomic24_t -	st	%g7, [%g1]		! Clever: This releases the lock as well -#else -	ld	[%g1], %g7		! Load locked atomic24_t -	sub	%g7, %g2, %g2		! Subtract argument -	st	%g2, [%g1]		! Store it back -#endif -	wr	%g3, 0x0, %psr		! Restore original PSR_PIL -	nop; nop; nop;			! Let the bits set -	jmpl	%o7, %g0		! NOTE: not + 8, see callers in atomic.h -	 mov	%g4, %o7		! Restore %o7 - -	.globl  __atomic_end -__atomic_end: diff --git a/arch/sparc/lib/atomic_64.S b/arch/sparc/lib/atomic_64.S index 59186e0fcf3..85c233d0a34 100644 --- a/arch/sparc/lib/atomic_64.S +++ b/arch/sparc/lib/atomic_64.S @@ -1,8 +1,9 @@  /* atomic.S: These things are too big to do inline.   * - * Copyright (C) 1999, 2007 David S. Miller (davem@davemloft.net) + * Copyright (C) 1999, 2007 2012 David S. Miller (davem@davemloft.net)   */ +#include <linux/linkage.h>  #include <asm/asi.h>  #include <asm/backoff.h> @@ -13,9 +14,7 @@  	 * memory barriers, and a second which returns  	 * a value and does the barriers.  	 */ -	.globl	atomic_add -	.type	atomic_add,#function -atomic_add: /* %o0 = increment, %o1 = atomic_ptr */ +ENTRY(atomic_add) /* %o0 = increment, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	lduw	[%o1], %g1  	add	%g1, %o0, %g7 @@ -26,11 +25,9 @@ atomic_add: /* %o0 = increment, %o1 = atomic_ptr */  	retl  	 nop  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic_add, .-atomic_add +ENDPROC(atomic_add) -	.globl	atomic_sub -	.type	atomic_sub,#function -atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ +ENTRY(atomic_sub) /* %o0 = decrement, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	lduw	[%o1], %g1  	sub	%g1, %o0, %g7 @@ -41,11 +38,9 @@ atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */  	retl  	 nop  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic_sub, .-atomic_sub +ENDPROC(atomic_sub) -	.globl	atomic_add_ret -	.type	atomic_add_ret,#function -atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ +ENTRY(atomic_add_ret) /* %o0 = increment, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	lduw	[%o1], %g1  	add	%g1, %o0, %g7 @@ -56,11 +51,9 @@ atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */  	retl  	 sra	%g1, 0, %o0  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic_add_ret, .-atomic_add_ret +ENDPROC(atomic_add_ret) -	.globl	atomic_sub_ret -	.type	atomic_sub_ret,#function -atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ +ENTRY(atomic_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	lduw	[%o1], %g1  	sub	%g1, %o0, %g7 @@ -71,11 +64,9 @@ atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */  	retl  	 sra	%g1, 0, %o0  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic_sub_ret, .-atomic_sub_ret +ENDPROC(atomic_sub_ret) -	.globl	atomic64_add -	.type	atomic64_add,#function -atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */ +ENTRY(atomic64_add) /* %o0 = increment, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	ldx	[%o1], %g1  	add	%g1, %o0, %g7 @@ -86,11 +77,9 @@ atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */  	retl  	 nop  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic64_add, .-atomic64_add +ENDPROC(atomic64_add) 
-	.globl	atomic64_sub -	.type	atomic64_sub,#function -atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */ +ENTRY(atomic64_sub) /* %o0 = decrement, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	ldx	[%o1], %g1  	sub	%g1, %o0, %g7 @@ -101,11 +90,9 @@ atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */  	retl  	 nop  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic64_sub, .-atomic64_sub +ENDPROC(atomic64_sub) -	.globl	atomic64_add_ret -	.type	atomic64_add_ret,#function -atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ +ENTRY(atomic64_add_ret) /* %o0 = increment, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	ldx	[%o1], %g1  	add	%g1, %o0, %g7 @@ -116,11 +103,9 @@ atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */  	retl  	 add	%g1, %o0, %o0  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic64_add_ret, .-atomic64_add_ret +ENDPROC(atomic64_add_ret) -	.globl	atomic64_sub_ret -	.type	atomic64_sub_ret,#function -atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ +ENTRY(atomic64_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */  	BACKOFF_SETUP(%o2)  1:	ldx	[%o1], %g1  	sub	%g1, %o0, %g7 @@ -131,4 +116,18 @@ atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */  	retl  	 sub	%g1, %o0, %o0  2:	BACKOFF_SPIN(%o2, %o3, 1b) -	.size	atomic64_sub_ret, .-atomic64_sub_ret +ENDPROC(atomic64_sub_ret) + +ENTRY(atomic64_dec_if_positive) /* %o0 = atomic_ptr */ +	BACKOFF_SETUP(%o2) +1:	ldx	[%o0], %g1 +	brlez,pn %g1, 3f +	 sub	%g1, 1, %g7 +	casx	[%o0], %g1, %g7 +	cmp	%g1, %g7 +	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b) +	 nop +3:	retl +	 sub	%g1, 1, %o0 +2:	BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic64_dec_if_positive) diff --git a/arch/sparc/lib/bitext.c b/arch/sparc/lib/bitext.c index 764b3eb7b60..8ec4e9c0251 100644 --- a/arch/sparc/lib/bitext.c +++ b/arch/sparc/lib/bitext.c @@ -10,7 +10,7 @@   */  #include <linux/string.h> -#include <linux/bitops.h> +#include <linux/bitmap.h>  #include <asm/bitext.h> @@ -80,8 +80,7 @@ int bit_map_string_get(struct bit_map *t, int len, int align)  		while (test_bit(offset + i, t->map) == 0) {  			i++;  			if (i == len) { -				for (i = 0; i < len; i++) -					__set_bit(offset + i, t->map); +				bitmap_set(t->map, offset, len);  				if (offset == t->first_free)  					t->first_free = find_next_zero_bit  							(t->map, t->size, @@ -120,11 +119,7 @@ void bit_map_clear(struct bit_map *t, int offset, int len)  void bit_map_init(struct bit_map *t, unsigned long *map, int size)  { - -	if ((size & 07) != 0) -		BUG(); -	memset(map, 0, size>>3); - +	bitmap_zero(map, size);  	memset(t, 0, sizeof *t);  	spin_lock_init(&t->lock);  	t->map = map; diff --git a/arch/sparc/lib/bitops.S b/arch/sparc/lib/bitops.S index 3dc61d5537c..36f72cc0e67 100644 --- a/arch/sparc/lib/bitops.S +++ b/arch/sparc/lib/bitops.S @@ -3,14 +3,13 @@   * Copyright (C) 2000, 2007 David S. 
Miller (davem@davemloft.net)   */ +#include <linux/linkage.h>  #include <asm/asi.h>  #include <asm/backoff.h>  	.text -	.globl	test_and_set_bit -	.type	test_and_set_bit,#function -test_and_set_bit:	/* %o0=nr, %o1=addr */ +ENTRY(test_and_set_bit)	/* %o0=nr, %o1=addr */  	BACKOFF_SETUP(%o3)  	srlx	%o0, 6, %g1  	mov	1, %o2 @@ -29,11 +28,9 @@ test_and_set_bit:	/* %o0=nr, %o1=addr */  	retl  	 nop  2:	BACKOFF_SPIN(%o3, %o4, 1b) -	.size	test_and_set_bit, .-test_and_set_bit +ENDPROC(test_and_set_bit) -	.globl	test_and_clear_bit -	.type	test_and_clear_bit,#function -test_and_clear_bit:	/* %o0=nr, %o1=addr */ +ENTRY(test_and_clear_bit) /* %o0=nr, %o1=addr */  	BACKOFF_SETUP(%o3)  	srlx	%o0, 6, %g1  	mov	1, %o2 @@ -52,11 +49,9 @@ test_and_clear_bit:	/* %o0=nr, %o1=addr */  	retl  	 nop  2:	BACKOFF_SPIN(%o3, %o4, 1b) -	.size	test_and_clear_bit, .-test_and_clear_bit +ENDPROC(test_and_clear_bit) -	.globl	test_and_change_bit -	.type	test_and_change_bit,#function -test_and_change_bit:	/* %o0=nr, %o1=addr */ +ENTRY(test_and_change_bit) /* %o0=nr, %o1=addr */  	BACKOFF_SETUP(%o3)  	srlx	%o0, 6, %g1  	mov	1, %o2 @@ -75,11 +70,9 @@ test_and_change_bit:	/* %o0=nr, %o1=addr */  	retl  	 nop  2:	BACKOFF_SPIN(%o3, %o4, 1b) -	.size	test_and_change_bit, .-test_and_change_bit +ENDPROC(test_and_change_bit) -	.globl	set_bit -	.type	set_bit,#function -set_bit:		/* %o0=nr, %o1=addr */ +ENTRY(set_bit) /* %o0=nr, %o1=addr */  	BACKOFF_SETUP(%o3)  	srlx	%o0, 6, %g1  	mov	1, %o2 @@ -96,11 +89,9 @@ set_bit:		/* %o0=nr, %o1=addr */  	retl  	 nop  2:	BACKOFF_SPIN(%o3, %o4, 1b) -	.size	set_bit, .-set_bit +ENDPROC(set_bit) -	.globl	clear_bit -	.type	clear_bit,#function -clear_bit:		/* %o0=nr, %o1=addr */ +ENTRY(clear_bit) /* %o0=nr, %o1=addr */  	BACKOFF_SETUP(%o3)  	srlx	%o0, 6, %g1  	mov	1, %o2 @@ -117,11 +108,9 @@ clear_bit:		/* %o0=nr, %o1=addr */  	retl  	 nop  2:	BACKOFF_SPIN(%o3, %o4, 1b) -	.size	clear_bit, .-clear_bit +ENDPROC(clear_bit) -	.globl	change_bit -	.type	change_bit,#function -change_bit:		/* %o0=nr, %o1=addr */ +ENTRY(change_bit) /* %o0=nr, %o1=addr */  	BACKOFF_SETUP(%o3)  	srlx	%o0, 6, %g1  	mov	1, %o2 @@ -138,4 +127,4 @@ change_bit:		/* %o0=nr, %o1=addr */  	retl  	 nop  2:	BACKOFF_SPIN(%o3, %o4, 1b) -	.size	change_bit, .-change_bit +ENDPROC(change_bit) diff --git a/arch/sparc/lib/blockops.S b/arch/sparc/lib/blockops.S index 804be87f9a4..3c771011ff4 100644 --- a/arch/sparc/lib/blockops.S +++ b/arch/sparc/lib/blockops.S @@ -4,6 +4,7 @@   * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)   */ +#include <linux/linkage.h>  #include <asm/page.h>  	/* Zero out 64 bytes of memory at (buf + offset). @@ -44,10 +45,7 @@  	 */  	.text -	.align	4 -	.globl	bzero_1page, __copy_1page - -bzero_1page: +ENTRY(bzero_1page)  /* NOTE: If you change the number of insns of this routine, please check   * arch/sparc/mm/hypersparc.S */  	/* %o0 = buf */ @@ -65,8 +63,9 @@ bzero_1page:  	retl  	 nop +ENDPROC(bzero_1page) -__copy_1page: +ENTRY(__copy_1page)  /* NOTE: If you change the number of insns of this routine, please check   * arch/sparc/mm/hypersparc.S */  	/* %o0 = dst, %o1 = src */ @@ -87,3 +86,4 @@ __copy_1page:  	retl  	 nop +ENDPROC(__copy_1page) diff --git a/arch/sparc/lib/bzero.S b/arch/sparc/lib/bzero.S index 615f401edf6..8c058114b64 100644 --- a/arch/sparc/lib/bzero.S +++ b/arch/sparc/lib/bzero.S @@ -4,11 +4,11 @@   * Copyright (C) 2005 David S. 
Miller <davem@davemloft.net>   */ +#include <linux/linkage.h> +  	.text -	.globl	memset -	.type	memset, #function -memset:			/* %o0=buf, %o1=pat, %o2=len */ +ENTRY(memset) /* %o0=buf, %o1=pat, %o2=len */  	and		%o1, 0xff, %o3  	mov		%o2, %o1  	sllx		%o3, 8, %g1 @@ -19,9 +19,7 @@ memset:			/* %o0=buf, %o1=pat, %o2=len */  	ba,pt		%xcc, 1f  	 or		%g1, %o2, %o2 -	.globl	__bzero -	.type	__bzero, #function -__bzero:		/* %o0=buf, %o1=len */ +ENTRY(__bzero) /* %o0=buf, %o1=len */  	clr		%o2  1:	mov		%o0, %o3  	brz,pn		%o1, __bzero_done @@ -78,8 +76,8 @@ __bzero_tiny:  __bzero_done:  	retl  	 mov		%o3, %o0 -	.size		__bzero, .-__bzero -	.size		memset, .-memset +ENDPROC(__bzero) +ENDPROC(memset)  #define EX_ST(x,y)		\  98:	x,y;			\ @@ -89,9 +87,7 @@ __bzero_done:  	.text;			\  	.align 4; -	.globl	__clear_user -	.type	__clear_user, #function -__clear_user:		/* %o0=buf, %o1=len */ +ENTRY(__clear_user) /* %o0=buf, %o1=len */  	brz,pn		%o1, __clear_user_done  	 cmp		%o1, 16  	bl,pn		%icc, __clear_user_tiny @@ -146,4 +142,4 @@ __clear_user_tiny:  __clear_user_done:  	retl  	 clr		%o0 -	.size		__clear_user, .-__clear_user +ENDPROC(__clear_user) diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S index 3632cb34e91..0084c3361e1 100644 --- a/arch/sparc/lib/checksum_32.S +++ b/arch/sparc/lib/checksum_32.S @@ -289,10 +289,16 @@ cc_end_cruft:  	/* Also, handle the alignment code out of band. */  cc_dword_align: -	cmp	%g1, 6 -	bl,a	ccte +	cmp	%g1, 16 +	bge	1f +	 srl	%g1, 1, %o3 +2:	cmp	%o3, 0 +	be,a	ccte  	 andcc	%g1, 0xf, %o3 -	andcc	%o0, 0x1, %g0 +	andcc	%o3, %o0, %g0	! Check %o0 only (%o1 has the same last 2 bits) +	be,a	2b +	 srl	%o3, 1, %o3 +1:	andcc	%o0, 0x1, %g0  	bne	ccslow  	 andcc	%o0, 0x2, %g0  	be	1f diff --git a/arch/sparc/lib/clear_page.S b/arch/sparc/lib/clear_page.S index 77e531f6c2a..46272dfc26e 100644 --- a/arch/sparc/lib/clear_page.S +++ b/arch/sparc/lib/clear_page.S @@ -37,10 +37,10 @@ _clear_page:		/* %o0=dest */  	.globl		clear_user_page  clear_user_page:	/* %o0=dest, %o1=vaddr */  	lduw		[%g6 + TI_PRE_COUNT], %o2 -	sethi		%uhi(PAGE_OFFSET), %g2 +	sethi		%hi(PAGE_OFFSET), %g2  	sethi		%hi(PAGE_SIZE), %o4 -	sllx		%g2, 32, %g2 +	ldx		[%g2 + %lo(PAGE_OFFSET)], %g2  	sethi		%hi(PAGE_KERNEL_LOCKED), %g3  	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 diff --git a/arch/sparc/lib/copy_page.S b/arch/sparc/lib/copy_page.S index b243d3b606b..dd16c61f326 100644 --- a/arch/sparc/lib/copy_page.S +++ b/arch/sparc/lib/copy_page.S @@ -34,10 +34,10 @@  #endif  #define TOUCH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7)	\ -	fmovd	%reg0, %f48; 	fmovd	%reg1, %f50;		\ -	fmovd	%reg2, %f52; 	fmovd	%reg3, %f54;		\ -	fmovd	%reg4, %f56; 	fmovd	%reg5, %f58;		\ -	fmovd	%reg6, %f60; 	fmovd	%reg7, %f62; +	fsrc2	%reg0, %f48; 	fsrc2	%reg1, %f50;		\ +	fsrc2	%reg2, %f52; 	fsrc2	%reg3, %f54;		\ +	fsrc2	%reg4, %f56; 	fsrc2	%reg5, %f58;		\ +	fsrc2	%reg6, %f60; 	fsrc2	%reg7, %f62;  	.text @@ -46,10 +46,10 @@  	.type		copy_user_page,#function  copy_user_page:		/* %o0=dest, %o1=src, %o2=vaddr */  	lduw		[%g6 + TI_PRE_COUNT], %o4 -	sethi		%uhi(PAGE_OFFSET), %g2 +	sethi		%hi(PAGE_OFFSET), %g2  	sethi		%hi(PAGE_SIZE), %o3 -	sllx		%g2, 32, %g2 +	ldx		[%g2 + %lo(PAGE_OFFSET)], %g2  	sethi		%hi(PAGE_KERNEL_LOCKED), %g3  	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 @@ -104,60 +104,60 @@ cheetah_copy_page_insn:  	prefetch	[%o1 + 0x140], #one_read  	ldd		[%o1 + 0x010], %f4  	prefetch	[%o1 + 0x180], #one_read -	fmovd		%f0, %f16 +	fsrc2		%f0, %f16  	ldd		[%o1 + 0x018], %f6 -	fmovd		%f2, %f18 +	fsrc2		%f2, %f18  	ldd		[%o1 + 
0x020], %f8 -	fmovd		%f4, %f20 +	fsrc2		%f4, %f20  	ldd		[%o1 + 0x028], %f10 -	fmovd		%f6, %f22 +	fsrc2		%f6, %f22  	ldd		[%o1 + 0x030], %f12 -	fmovd		%f8, %f24 +	fsrc2		%f8, %f24  	ldd		[%o1 + 0x038], %f14 -	fmovd		%f10, %f26 +	fsrc2		%f10, %f26  	ldd		[%o1 + 0x040], %f0  1:	ldd		[%o1 + 0x048], %f2 -	fmovd		%f12, %f28 +	fsrc2		%f12, %f28  	ldd		[%o1 + 0x050], %f4 -	fmovd		%f14, %f30 +	fsrc2		%f14, %f30  	stda		%f16, [%o0] ASI_BLK_P  	ldd		[%o1 + 0x058], %f6 -	fmovd		%f0, %f16 +	fsrc2		%f0, %f16  	ldd		[%o1 + 0x060], %f8 -	fmovd		%f2, %f18 +	fsrc2		%f2, %f18  	ldd		[%o1 + 0x068], %f10 -	fmovd		%f4, %f20 +	fsrc2		%f4, %f20  	ldd		[%o1 + 0x070], %f12 -	fmovd		%f6, %f22 +	fsrc2		%f6, %f22  	ldd		[%o1 + 0x078], %f14 -	fmovd		%f8, %f24 +	fsrc2		%f8, %f24  	ldd		[%o1 + 0x080], %f0  	prefetch	[%o1 + 0x180], #one_read -	fmovd		%f10, %f26 +	fsrc2		%f10, %f26  	subcc		%o2, 1, %o2  	add		%o0, 0x40, %o0  	bne,pt		%xcc, 1b  	 add		%o1, 0x40, %o1  	ldd		[%o1 + 0x048], %f2 -	fmovd		%f12, %f28 +	fsrc2		%f12, %f28  	ldd		[%o1 + 0x050], %f4 -	fmovd		%f14, %f30 +	fsrc2		%f14, %f30  	stda		%f16, [%o0] ASI_BLK_P  	ldd		[%o1 + 0x058], %f6 -	fmovd		%f0, %f16 +	fsrc2		%f0, %f16  	ldd		[%o1 + 0x060], %f8 -	fmovd		%f2, %f18 +	fsrc2		%f2, %f18  	ldd		[%o1 + 0x068], %f10 -	fmovd		%f4, %f20 +	fsrc2		%f4, %f20  	ldd		[%o1 + 0x070], %f12 -	fmovd		%f6, %f22 +	fsrc2		%f6, %f22  	add		%o0, 0x40, %o0  	ldd		[%o1 + 0x078], %f14 -	fmovd		%f8, %f24 -	fmovd		%f10, %f26 -	fmovd		%f12, %f28 -	fmovd		%f14, %f30 +	fsrc2		%f8, %f24 +	fsrc2		%f10, %f26 +	fsrc2		%f12, %f28 +	fsrc2		%f14, %f30  	stda		%f16, [%o0] ASI_BLK_P  	membar		#Sync  	VISExitHalf diff --git a/arch/sparc/lib/divdi3.S b/arch/sparc/lib/divdi3.S index 681b3683da9..9614b48b6ef 100644 --- a/arch/sparc/lib/divdi3.S +++ b/arch/sparc/lib/divdi3.S @@ -17,21 +17,6 @@ along with GNU CC; see the file COPYING.  If not, write to  the Free Software Foundation, 59 Temple Place - Suite 330,  Boston, MA 02111-1307, USA.  
*/ -	.data -	.align 8 -	.globl	__clz_tab -__clz_tab: -	.byte	0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 -	.byte	6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 -	.byte	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 -	.byte	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 -	.byte	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 -	.byte	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 -	.byte	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 -	.byte	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 -	.size	 __clz_tab,256 -	.global .udiv -  	.text  	.align 4  	.globl __divdi3 @@ -97,8 +82,9 @@ __divdi3:  	bne .LL85  	mov %i0,%o2  	mov 1,%o0 -	call .udiv,0  	mov 0,%o1 +	wr %g0, 0, %y +	udiv %o0, %o1, %o0  	mov %o0,%o4  	mov %i0,%o2  .LL85: diff --git a/arch/sparc/lib/ffs.S b/arch/sparc/lib/ffs.S new file mode 100644 index 00000000000..b39389f6989 --- /dev/null +++ b/arch/sparc/lib/ffs.S @@ -0,0 +1,84 @@ +#include <linux/linkage.h> + +	.register	%g2,#scratch + +	.text +	.align	32 + +ENTRY(ffs) +	brnz,pt	%o0, 1f +	 mov	1, %o1 +	retl +	 clr	%o0 +	nop +	nop +ENTRY(__ffs) +	sllx	%o0, 32, %g1		/* 1  */ +	srlx	%o0, 32, %g2 + +	clr	%o1			/* 2  */ +	movrz	%g1, %g2, %o0 + +	movrz	%g1, 32, %o1		/* 3  */ +1:	clr	%o2 + +	sllx	%o0, (64 - 16), %g1	/* 4  */ +	srlx	%o0, 16, %g2 + +	movrz	%g1, %g2, %o0		/* 5  */ +	clr	%o3 + +	movrz	%g1, 16, %o2		/* 6  */ +	clr	%o4 + +	and	%o0, 0xff, %g1		/* 7  */ +	srlx	%o0, 8, %g2 + +	movrz	%g1, %g2, %o0		/* 8  */ +	clr	%o5 + +	movrz	%g1, 8, %o3		/* 9  */ +	add	%o2, %o1, %o2 + +	and	%o0, 0xf, %g1		/* 10 */ +	srlx	%o0, 4, %g2 + +	movrz	%g1, %g2, %o0		/* 11 */ +	add	%o2, %o3, %o2 + +	movrz	%g1, 4, %o4		/* 12 */ + +	and	%o0, 0x3, %g1		/* 13 */ +	srlx	%o0, 2, %g2 + +	movrz	%g1, %g2, %o0		/* 14 */ +	add	%o2, %o4, %o2 + +	movrz	%g1, 2, %o5		/* 15 */ + +	and	%o0, 0x1, %g1		/* 16 */ + +	add	%o2, %o5, %o2		/* 17 */ +	xor	%g1, 0x1, %g1 + +	retl				/* 18 */ +	 add	%o2, %g1, %o0 +ENDPROC(ffs) +ENDPROC(__ffs) + +	.section	.popc_6insn_patch, "ax" +	.word		ffs +	brz,pn	%o0, 98f +	 neg	%o0, %g1 +	xnor	%o0, %g1, %o1 +	popc	%o1, %o0 +98:	retl +	 nop +	.word		__ffs +	neg	%o0, %g1 +	xnor	%o0, %g1, %o1 +	popc	%o1, %o0 +	retl +	 sub	%o0, 1, %o0 +	nop +	.previous diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S new file mode 100644 index 00000000000..95414e0a680 --- /dev/null +++ b/arch/sparc/lib/hweight.S @@ -0,0 +1,51 @@ +#include <linux/linkage.h> + +	.text +	.align	32 +ENTRY(__arch_hweight8) +	ba,pt	%xcc, __sw_hweight8 +	 nop +	nop +ENDPROC(__arch_hweight8) +	.section	.popc_3insn_patch, "ax" +	.word		__arch_hweight8 +	sllx		%o0, 64-8, %g1 +	retl +	 popc		%g1, %o0 +	.previous + +ENTRY(__arch_hweight16) +	ba,pt	%xcc, __sw_hweight16 +	 nop +	nop +ENDPROC(__arch_hweight16) +	.section	.popc_3insn_patch, "ax" +	.word		__arch_hweight16 +	sllx		%o0, 64-16, %g1 +	retl +	 popc		%g1, %o0 +	.previous + +ENTRY(__arch_hweight32) +	ba,pt	%xcc, __sw_hweight32 +	 nop +	nop +ENDPROC(__arch_hweight32) +	.section	.popc_3insn_patch, "ax" +	.word		__arch_hweight32 +	sllx		%o0, 64-32, %g1 +	retl +	 popc		%g1, %o0 +	.previous + +ENTRY(__arch_hweight64) +	ba,pt	%xcc, __sw_hweight64 +	 nop +	nop +ENDPROC(__arch_hweight64) +	.section	.popc_3insn_patch, "ax" +	.word		__arch_hweight64 +	retl +	 popc		%o0, %o0 +	nop +	.previous diff --git a/arch/sparc/lib/iomap.c b/arch/sparc/lib/iomap.c index 9ef37e13a92..c4d42a50ebc 100644 --- a/arch/sparc/lib/iomap.c +++ b/arch/sparc/lib/iomap.c @@ 
-18,31 +18,8 @@ void ioport_unmap(void __iomem *addr)  EXPORT_SYMBOL(ioport_map);  EXPORT_SYMBOL(ioport_unmap); -/* Create a virtual mapping cookie for a PCI BAR (memory or IO) */ -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) -{ -	resource_size_t start = pci_resource_start(dev, bar); -	resource_size_t len = pci_resource_len(dev, bar); -	unsigned long flags = pci_resource_flags(dev, bar); - -	if (!len || !start) -		return NULL; -	if (maxlen && len > maxlen) -		len = maxlen; -	if (flags & IORESOURCE_IO) -		return ioport_map(start, len); -	if (flags & IORESOURCE_MEM) { -		if (flags & IORESOURCE_CACHEABLE) -			return ioremap(start, len); -		return ioremap_nocache(start, len); -	} -	/* What? */ -	return NULL; -} -  void pci_iounmap(struct pci_dev *dev, void __iomem * addr)  {  	/* nothing to do */  } -EXPORT_SYMBOL(pci_iomap);  EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/sparc/lib/ipcsum.S b/arch/sparc/lib/ipcsum.S index 58ca5b9a877..4742d59029e 100644 --- a/arch/sparc/lib/ipcsum.S +++ b/arch/sparc/lib/ipcsum.S @@ -1,8 +1,7 @@ +#include <linux/linkage.h> +  	.text -	.align	32 -	.globl	ip_fast_csum -	.type	ip_fast_csum,#function -ip_fast_csum:	/* %o0 = iph, %o1 = ihl */ +ENTRY(ip_fast_csum) /* %o0 = iph, %o1 = ihl */  	sub	%o1, 4, %g7  	lduw	[%o0 + 0x00], %o2  	lduw	[%o0 + 0x04], %g2 @@ -31,4 +30,4 @@ ip_fast_csum:	/* %o0 = iph, %o1 = ihl */  	set	0xffff, %o1  	retl  	 and	%o2, %o1, %o0 -	.size	ip_fast_csum, .-ip_fast_csum +ENDPROC(ip_fast_csum) diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c index 1b30bb3bfdb..323335b9cd2 100644 --- a/arch/sparc/lib/ksyms.c +++ b/arch/sparc/lib/ksyms.c @@ -15,8 +15,6 @@  /* string functions */  EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(__strlen_user); -EXPORT_SYMBOL(__strnlen_user);  EXPORT_SYMBOL(strncmp);  /* mem* functions */ @@ -33,9 +31,6 @@ EXPORT_SYMBOL(memset);  EXPORT_SYMBOL(memmove);  EXPORT_SYMBOL(__bzero); -/* Moving data to/from/in userspace. */ -EXPORT_SYMBOL(__strncpy_from_user); -  /* Networking helper routines. */  EXPORT_SYMBOL(csum_partial); @@ -56,24 +51,10 @@ extern int __divdi3(int, int);  extern void (*__copy_1page)(void *, const void *);  extern void (*bzero_1page)(void *); -extern int __strncmp(const char *, const char *, __kernel_size_t); -  extern void ___rw_read_enter(void);  extern void ___rw_read_try(void);  extern void ___rw_read_exit(void);  extern void ___rw_write_enter(void); -extern void ___atomic24_add(void); -extern void ___atomic24_sub(void); - -/* Alias functions whose names begin with "." and export the aliases. - * The module references will be fixed up by module_frob_arch_sections. - */ -extern int _Div(int, int); -extern int _Mul(int, int); -extern int _Rem(int, int); -extern unsigned _Udiv(unsigned, unsigned); -extern unsigned _Umul(unsigned, unsigned); -extern unsigned _Urem(unsigned, unsigned);  /* Networking helper routines. */  EXPORT_SYMBOL(__csum_partial_copy_sparc_generic); @@ -83,9 +64,6 @@ EXPORT_SYMBOL(__copy_1page);  EXPORT_SYMBOL(__memmove);  EXPORT_SYMBOL(bzero_1page); -/* string functions */ -EXPORT_SYMBOL(__strncmp); -  /* Moving data to/from/in userspace. */  EXPORT_SYMBOL(__copy_user); @@ -97,22 +75,11 @@ EXPORT_SYMBOL(___rw_read_exit);  EXPORT_SYMBOL(___rw_write_enter);  #endif -/* Atomic operations. 
*/ -EXPORT_SYMBOL(___atomic24_add); -EXPORT_SYMBOL(___atomic24_sub); -  EXPORT_SYMBOL(__ashrdi3);  EXPORT_SYMBOL(__ashldi3);  EXPORT_SYMBOL(__lshrdi3);  EXPORT_SYMBOL(__muldi3);  EXPORT_SYMBOL(__divdi3); - -EXPORT_SYMBOL(_Rem); -EXPORT_SYMBOL(_Urem); -EXPORT_SYMBOL(_Mul); -EXPORT_SYMBOL(_Umul); -EXPORT_SYMBOL(_Div); -EXPORT_SYMBOL(_Udiv);  #endif  /* @@ -131,15 +98,6 @@ EXPORT_SYMBOL(___copy_from_user);  EXPORT_SYMBOL(___copy_in_user);  EXPORT_SYMBOL(__clear_user); -/* RW semaphores */ -EXPORT_SYMBOL(__down_read); -EXPORT_SYMBOL(__down_read_trylock); -EXPORT_SYMBOL(__down_write); -EXPORT_SYMBOL(__down_write_trylock); -EXPORT_SYMBOL(__up_read); -EXPORT_SYMBOL(__up_write); -EXPORT_SYMBOL(__downgrade_write); -  /* Atomic counter implementation. */  EXPORT_SYMBOL(atomic_add);  EXPORT_SYMBOL(atomic_add_ret); @@ -149,6 +107,7 @@ EXPORT_SYMBOL(atomic64_add);  EXPORT_SYMBOL(atomic64_add_ret);  EXPORT_SYMBOL(atomic64_sub);  EXPORT_SYMBOL(atomic64_sub_ret); +EXPORT_SYMBOL(atomic64_dec_if_positive);  /* Atomic bit operations. */  EXPORT_SYMBOL(test_and_set_bit); @@ -167,6 +126,10 @@ EXPORT_SYMBOL(copy_user_page);  void VISenter(void);  EXPORT_SYMBOL(VISenter); +/* CRYPTO code needs this */ +void VISenterhalf(void); +EXPORT_SYMBOL(VISenterhalf); +  extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);  extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,  		unsigned long *); diff --git a/arch/sparc/lib/lshrdi3.S b/arch/sparc/lib/lshrdi3.S index 47a1354c160..60ebc7cdbee 100644 --- a/arch/sparc/lib/lshrdi3.S +++ b/arch/sparc/lib/lshrdi3.S @@ -1,6 +1,6 @@ +#include <linux/linkage.h> -	.globl	__lshrdi3 -__lshrdi3: +ENTRY(__lshrdi3)  	cmp	%o2, 0  	be	3f  	 mov	0x20, %g2 @@ -24,3 +24,4 @@ __lshrdi3:  3:  	retl   	 nop  +ENDPROC(__lshrdi3) diff --git a/arch/sparc/lib/memcpy.S b/arch/sparc/lib/memcpy.S index 34fe6575173..4d8c497517b 100644 --- a/arch/sparc/lib/memcpy.S +++ b/arch/sparc/lib/memcpy.S @@ -7,40 +7,12 @@   * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)   */ -#ifdef __KERNEL__ - -#define FUNC(x) 											\ +#define FUNC(x) 		\  	.globl	x;		\  	.type	x,@function;	\ -	.align	4;											\ +	.align	4;		\  x: -#undef FASTER_REVERSE -#undef FASTER_NONALIGNED -#define FASTER_ALIGNED - -/* In kernel these functions don't return a value. - * One should use macros in asm/string.h for that purpose. - * We return 0, so that bugs are more apparent. - */ -#define SETUP_RETL -#define RETL_INSN	clr	%o0 - -#else - -/* libc */ - -#include "DEFS.h" - -#define FASTER_REVERSE -#define FASTER_NONALIGNED -#define FASTER_ALIGNED - -#define SETUP_RETL	mov	%o0, %g6 -#define RETL_INSN	mov	%g6, %o0 - -#endif -  /* Both these macros have to start with exactly the same insn */  #define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \  	ldd	[%src + (offset) + 0x00], %t0; \ @@ -164,30 +136,6 @@ x:  	.text  	.align	4 -#ifdef FASTER_REVERSE - -70:	/* rdword_align */ - -	andcc		%o1, 1, %g0 -	be		4f -	 andcc		%o1, 2, %g0 - -	ldub		[%o1 - 1], %g2 -	sub		%o1, 1, %o1 -	stb		%g2, [%o0 - 1] -	sub		%o2, 1, %o2 -	be		3f -	 sub		%o0, 1, %o0 -4: -	lduh		[%o1 - 2], %g2 -	sub		%o1, 2, %o1 -	sth		%g2, [%o0 - 2] -	sub		%o2, 2, %o2 -	b		3f -	 sub		%o0, 2, %o0 - -#endif /* FASTER_REVERSE */ -  0:  	retl  	 nop		! Only bcopy returns here and it retuns void... 
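
__lshrdi3, exported just above, is the libgcc-style 64-bit logical right shift built from 32-bit halves. A simplified C rendering of the word-splitting logic (names and the 0..63 shift-count assumption are for illustration only):

#include <stdint.h>
#include <assert.h>

/* 64-bit logical right shift assembled from 32-bit halves. */
static uint64_t lshrdi3_c(uint64_t v, unsigned int shift)
{
	uint32_t hi = (uint32_t)(v >> 32);
	uint32_t lo = (uint32_t)v;
	uint32_t rhi, rlo;

	if (shift == 0) {
		rhi = hi;
		rlo = lo;
	} else if (shift < 32) {
		rhi = hi >> shift;
		rlo = (lo >> shift) | (hi << (32 - shift));
	} else {				/* 32..63: high result word is zero */
		rhi = 0;
		rlo = hi >> (shift - 32);
	}
	return ((uint64_t)rhi << 32) | rlo;
}

int main(void)
{
	assert(lshrdi3_c(0x123456789abcdef0ULL,  8) == 0x00123456789abcdeULL);
	assert(lshrdi3_c(0x8000000000000000ULL, 33) == 0x40000000ULL);
	return 0;
}
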
@@ -198,7 +146,7 @@ FUNC(__memmove)  #endif  FUNC(memmove)  	cmp		%o0, %o1 -	SETUP_RETL +	mov		%o0, %g7  	bleu		9f  	 sub		%o0, %o1, %o4 @@ -207,8 +155,6 @@ FUNC(memmove)  	bleu		0f  	 andcc		%o4, 3, %o5 -#ifndef FASTER_REVERSE -  	add		%o1, %o2, %o1  	add		%o0, %o2, %o0  	sub		%o1, 1, %o1 @@ -224,295 +170,7 @@ FUNC(memmove)  	 sub		%o0, 1, %o0  	retl -	 RETL_INSN - -#else /* FASTER_REVERSE */ - -	add		%o1, %o2, %o1 -	add		%o0, %o2, %o0 -	bne		77f -	 cmp		%o2, 15 -	bleu		91f -	 andcc		%o1, 3, %g0 -	bne		70b -3: -	 andcc		%o1, 4, %g0 - -	be		2f -	 mov		%o2, %g1 - -	ld		[%o1 - 4], %o4 -	sub		%g1, 4, %g1 -	st		%o4, [%o0 - 4] -	sub		%o1, 4, %o1 -	sub		%o0, 4, %o0 -2: -	andcc		%g1, 0xffffff80, %g7 -	be		3f -	 andcc		%o0, 4, %g0 - -	be		74f + 4 -5: -	RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) -	RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) -	RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) -	RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) -	subcc		%g7, 128, %g7 -	sub		%o1, 128, %o1 -	bne		5b -	 sub		%o0, 128, %o0 -3: -	andcc		%g1, 0x70, %g7 -	be		72f -	 andcc		%g1, 8, %g0 - -	sethi		%hi(72f), %o5 -	srl		%g7, 1, %o4 -	add		%g7, %o4, %o4 -	sub		%o1, %g7, %o1 -	sub		%o5, %o4, %o5 -	jmpl		%o5 + %lo(72f), %g0 -	 sub		%o0, %g7, %o0 - -71:	/* rmemcpy_table */ -	RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) -	RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) -	RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) -	RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) -	RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) -	RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) -	RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) - -72:	/* rmemcpy_table_end */ - -	be		73f -	 andcc		%g1, 4, %g0 - -	ldd		[%o1 - 0x08], %g2 -	sub		%o0, 8, %o0 -	sub		%o1, 8, %o1 -	st		%g2, [%o0] -	st		%g3, [%o0 + 0x04] - -73:	/* rmemcpy_last7 */ - -	be		1f -	 andcc		%g1, 2, %g0 - -	ld		[%o1 - 4], %g2 -	sub		%o1, 4, %o1 -	st		%g2, [%o0 - 4] -	sub		%o0, 4, %o0 -1: -	be		1f -	 andcc		%g1, 1, %g0 - -	lduh		[%o1 - 2], %g2 -	sub		%o1, 2, %o1 -	sth		%g2, [%o0 - 2] -	sub		%o0, 2, %o0 -1: -	be		1f -	 nop - -	ldub		[%o1 - 1], %g2 -	stb		%g2, [%o0 - 1] -1: -	retl - 	 RETL_INSN - -74:	/* rldd_std */ -	RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) -	RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) -	RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) -	RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) -	subcc		%g7, 128, %g7 -	sub		%o1, 128, %o1 -	bne		74b -	 sub		%o0, 128, %o0 - -	andcc		%g1, 0x70, %g7 -	be		72b -	 andcc		%g1, 8, %g0 - -	sethi		%hi(72b), %o5 -	srl		%g7, 1, %o4 -	add		%g7, %o4, %o4 -	sub		%o1, %g7, %o1 -	sub		%o5, %o4, %o5 -	jmpl		%o5 + %lo(72b), %g0 -	 sub		%o0, %g7, %o0 - -75:	/* rshort_end */ - -	and		%o2, 0xe, %o3 -2: -	sethi		%hi(76f), %o5 -	sll		%o3, 3, %o4 -	sub		%o0, %o3, %o0 -	sub		%o5, %o4, %o5 -	sub		%o1, %o3, %o1 -	jmpl		%o5 + %lo(76f), %g0 -	 andcc		%o2, 1, %g0 - -	RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) -	RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) -	RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) -	RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) -	RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) -	RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) -	RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) - -76:	/* rshort_table_end */ - -	be		1f -	 nop -	ldub		[%o1 - 1], %g2 -	stb		%g2, [%o0 - 1] -1: -	retl - 	 RETL_INSN - -91:	/* rshort_aligned_end */ - -	bne		75b -	 andcc		%o2, 8, %g0 - -	be		1f -	 andcc		%o2, 4, %g0 - -	ld		[%o1 - 0x08], %g2 -	ld		[%o1 - 0x04], %g3 -	sub		%o1, 8, %o1 
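
The memmove entry above picks a copy direction from the cmp %o0, %o1 / bleu pair: forward (falling into the memcpy path) when the destination lies below the source, byte-by-byte backwards otherwise. A plain C sketch of that overlap rule (memmove_c is an illustrative name, not the kernel routine):

#include <stddef.h>
#include <string.h>
#include <assert.h>

/* Copy forward when dst is below src, backward otherwise, so overlapping
 * regions are handled in either direction. */
static void *memmove_c(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (d <= s) {
		while (n--)
			*d++ = *s++;
	} else {
		d += n;
		s += n;
		while (n--)
			*--d = *--s;
	}
	return dst;
}

int main(void)
{
	char buf[16] = "abcdef";

	memmove_c(buf + 2, buf, 4);	/* overlap: must run backwards */
	assert(memcmp(buf, "ababcd", 6) == 0);
	return 0;
}
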
-	st		%g2, [%o0 - 0x08] -	st		%g3, [%o0 - 0x04] -	sub		%o0, 8, %o0 -1: -	b		73b -	 mov		%o2, %g1 - -77:	/* rnon_aligned */ -	cmp		%o2, 15 -	bleu		75b -	 andcc		%o0, 3, %g0 -	be		64f -	 andcc		%o0, 1, %g0 -	be		63f -	 andcc		%o0, 2, %g0 -	ldub		[%o1 - 1], %g5 -	sub		%o1, 1, %o1 -	stb		%g5, [%o0 - 1] -	sub		%o0, 1, %o0 -	be		64f -	 sub		%o2, 1, %o2 -63: -	ldub		[%o1 - 1], %g5 -	sub		%o1, 2, %o1 -	stb		%g5, [%o0 - 1] -	sub		%o0, 2, %o0 -	ldub		[%o1], %g5 -	sub		%o2, 2, %o2 -	stb		%g5, [%o0] -64:	 -	and		%o1, 3, %g2 -	and		%o1, -4, %o1 -	and		%o2, 0xc, %g3 -	add		%o1, 4, %o1 -	cmp		%g3, 4 -	sll		%g2, 3, %g4 -	mov		32, %g2 -	be		4f -	 sub		%g2, %g4, %g7 - -	blu		3f -	 cmp		%g3, 8 - -	be		2f -	 srl		%o2, 2, %g3 - -	ld		[%o1 - 4], %o3 -	add		%o0, -8, %o0 -	ld		[%o1 - 8], %o4 -	add		%o1, -16, %o1 -	b		7f -	 add		%g3, 1, %g3 -2: -	ld		[%o1 - 4], %o4 -	add		%o0, -4, %o0 -	ld		[%o1 - 8], %g1 -	add		%o1, -12, %o1 -	b		8f -	 add		%g3, 2, %g3 -3: -	ld		[%o1 - 4], %o5 -	add		%o0, -12, %o0 -	ld		[%o1 - 8], %o3 -	add		%o1, -20, %o1 -	b		6f -	 srl		%o2, 2, %g3 -4: -	ld		[%o1 - 4], %g1 -	srl		%o2, 2, %g3 -	ld		[%o1 - 8], %o5 -	add		%o1, -24, %o1 -	add		%o0, -16, %o0 -	add		%g3, -1, %g3 - -	ld		[%o1 + 12], %o3 -5: -	sll		%o5, %g4, %g2 -	srl		%g1, %g7, %g5 -	or		%g2, %g5, %g2 -	st		%g2, [%o0 + 12] -6: -	ld		[%o1 + 8], %o4 -	sll		%o3, %g4, %g2 -	srl		%o5, %g7, %g5 -	or		%g2, %g5, %g2 -	st		%g2, [%o0 + 8] -7: -	ld		[%o1 + 4], %g1 -	sll		%o4, %g4, %g2 -	srl		%o3, %g7, %g5 -	or		%g2, %g5, %g2 -	st		%g2, [%o0 + 4] -8: -	ld		[%o1], %o5 -	sll		%g1, %g4, %g2 -	srl		%o4, %g7, %g5 -	addcc		%g3, -4, %g3 -	or		%g2, %g5, %g2 -	add		%o1, -16, %o1 -	st		%g2, [%o0] -	add		%o0, -16, %o0 -	bne,a		5b	 -	 ld		[%o1 + 12], %o3 -	sll		%o5, %g4, %g2 -	srl		%g1, %g7, %g5 -	srl		%g4, 3, %g3 -	or		%g2, %g5, %g2 -	add		%o1, %g3, %o1 -	andcc		%o2, 2, %g0 -	st		%g2, [%o0 + 12] -	be		1f -	 andcc		%o2, 1, %g0 -	 -	ldub		[%o1 + 15], %g5 -	add		%o1, -2, %o1 -	stb		%g5, [%o0 + 11] -	add		%o0, -2, %o0 -	ldub		[%o1 + 16], %g5 -	stb		%g5, [%o0 + 12] -1: -	be		1f -	 nop -	ldub		[%o1 + 15], %g5 -	stb		%g5, [%o0 + 11] -1: -	retl -	 RETL_INSN - -#endif /* FASTER_REVERSE */ +	 mov		%g7, %o0  /* NOTE: This code is executed just for the cases,           where %src (=%o1) & 3 is != 0. 
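
The NOTE above introduces the path taken when the source is not word aligned: each destination word is assembled from two adjacent aligned loads with an sll/srl/or triplet, whose shift counts are off*8 and 32 - off*8 (%g4 and %l0 in the rewritten code). A small C sketch of that merge on register values, assuming SPARC's big-endian view and a byte offset of 1..3:

#include <stdint.h>
#include <assert.h>

/* Merge two adjacent aligned words into the unaligned word that begins
 * 'off' bytes into w0: the earlier word supplies its low 4-off bytes,
 * the later word its high off bytes. */
static uint32_t merge_words(uint32_t w0, uint32_t w1, unsigned int off)
{
	unsigned int lsh = off * 8;		/* %g4 in the assembly */
	unsigned int rsh = 32 - lsh;		/* %l0 */

	return (w0 << lsh) | (w1 >> rsh);
}

int main(void)
{
	/* bytes AA BB CC DD | EE FF 11 22: the word at offset 1 is BB CC DD EE */
	assert(merge_words(0xAABBCCDDu, 0xEEFF1122u, 1) == 0xBBCCDDEEu);
	assert(merge_words(0xAABBCCDDu, 0xEEFF1122u, 3) == 0xDDEEFF11u);
	return 0;
}
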
@@ -546,7 +204,7 @@ FUNC(memmove)  FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */  	sub		%o0, %o1, %o4 -	SETUP_RETL +	mov		%o0, %g7  9:  	andcc		%o4, 3, %o5  0: @@ -569,7 +227,7 @@ FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */  	add		%o1, 4, %o1  	add		%o0, 4, %o0  2: -	andcc		%g1, 0xffffff80, %g7 +	andcc		%g1, 0xffffff80, %g0  	be		3f  	 andcc		%o0, 4, %g0 @@ -579,22 +237,23 @@ FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */  	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)  	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)  	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) -	subcc		%g7, 128, %g7 +	sub		%g1, 128, %g1  	add		%o1, 128, %o1 -	bne		5b +	cmp		%g1, 128 +	bge		5b  	 add		%o0, 128, %o0  3: -	andcc		%g1, 0x70, %g7 +	andcc		%g1, 0x70, %g4  	be		80f  	 andcc		%g1, 8, %g0  	sethi		%hi(80f), %o5 -	srl		%g7, 1, %o4 -	add		%g7, %o4, %o4 -	add		%o1, %g7, %o1 +	srl		%g4, 1, %o4 +	add		%g4, %o4, %o4 +	add		%o1, %g4, %o1  	sub		%o5, %o4, %o5  	jmpl		%o5 + %lo(80f), %g0 -	 add		%o0, %g7, %o0 +	 add		%o0, %g4, %o0  79:	/* memcpy_table */ @@ -641,43 +300,28 @@ FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */  	stb		%g2, [%o0]  1:  	retl - 	 RETL_INSN +	 mov		%g7, %o0  82:	/* ldd_std */  	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)  	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)  	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)  	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) -	subcc		%g7, 128, %g7 +	subcc		%g1, 128, %g1  	add		%o1, 128, %o1 -	bne		82b +	cmp		%g1, 128 +	bge		82b  	 add		%o0, 128, %o0 -#ifndef FASTER_ALIGNED - -	andcc		%g1, 0x70, %g7 -	be		80b -	 andcc		%g1, 8, %g0 - -	sethi		%hi(80b), %o5 -	srl		%g7, 1, %o4 -	add		%g7, %o4, %o4 -	add		%o1, %g7, %o1 -	sub		%o5, %o4, %o5 -	jmpl		%o5 + %lo(80b), %g0 -	 add		%o0, %g7, %o0 - -#else /* FASTER_ALIGNED */ - -	andcc		%g1, 0x70, %g7 +	andcc		%g1, 0x70, %g4  	be		84f  	 andcc		%g1, 8, %g0  	sethi		%hi(84f), %o5 -	add		%o1, %g7, %o1 -	sub		%o5, %g7, %o5 +	add		%o1, %g4, %o1 +	sub		%o5, %g4, %o5  	jmpl		%o5 + %lo(84f), %g0 -	 add		%o0, %g7, %o0 +	 add		%o0, %g4, %o0  83:	/* amemcpy_table */ @@ -721,382 +365,132 @@ FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */  	stb		%g2, [%o0]  1:  	retl - 	 RETL_INSN - -#endif /* FASTER_ALIGNED */ +	 mov		%g7, %o0  86:	/* non_aligned */  	cmp		%o2, 6  	bleu		88f +	 nop -#ifdef FASTER_NONALIGNED - -	 cmp		%o2, 256 -	bcc		87f - -#endif /* FASTER_NONALIGNED */ - -	 andcc		%o0, 3, %g0 +	save		%sp, -96, %sp +	andcc		%i0, 3, %g0  	be		61f -	 andcc		%o0, 1, %g0 +	 andcc		%i0, 1, %g0  	be		60f -	 andcc		%o0, 2, %g0 +	 andcc		%i0, 2, %g0 -	ldub		[%o1], %g5 -	add		%o1, 1, %o1 -	stb		%g5, [%o0] -	sub		%o2, 1, %o2 +	ldub		[%i1], %g5 +	add		%i1, 1, %i1 +	stb		%g5, [%i0] +	sub		%i2, 1, %i2  	bne		61f -	 add		%o0, 1, %o0 +	 add		%i0, 1, %i0  60: -	ldub		[%o1], %g3 -	add		%o1, 2, %o1 -	stb		%g3, [%o0] -	sub		%o2, 2, %o2 -	ldub		[%o1 - 1], %g3 -	add		%o0, 2, %o0 -	stb		%g3, [%o0 - 1] +	ldub		[%i1], %g3 +	add		%i1, 2, %i1 +	stb		%g3, [%i0] +	sub		%i2, 2, %i2 +	ldub		[%i1 - 1], %g3 +	add		%i0, 2, %i0 +	stb		%g3, [%i0 - 1]  61: -	and		%o1, 3, %g2 -	and		%o2, 0xc, %g3 -	and		%o1, -4, %o1 +	and		%i1, 3, %g2 +	and		%i2, 0xc, %g3 +	and		%i1, -4, %i1  	cmp		%g3, 4  	sll		%g2, 3, %g4  	mov		32, %g2  	be		4f -	 sub		%g2, %g4, %g7 +	 sub		%g2, %g4, %l0  	blu		3f  	 cmp		%g3, 0x8  	be		2f -	 srl		%o2, 2, %g3 +	 srl		%i2, 2, %g3 -	ld		[%o1], %o3 -	add		%o0, -8, %o0 -	ld		[%o1 + 4], %o4 +	ld		[%i1], %i3 +	add		%i0, -8, %i0 +	ld		[%i1 + 4], %i4  	b		
8f  	 add		%g3, 1, %g3  2: -	ld		[%o1], %o4 -	add		%o0, -12, %o0 -	ld		[%o1 + 4], %o5 +	ld		[%i1], %i4 +	add		%i0, -12, %i0 +	ld		[%i1 + 4], %i5  	add		%g3, 2, %g3  	b		9f -	 add		%o1, -4, %o1 +	 add		%i1, -4, %i1  3: -	ld		[%o1], %g1 -	add		%o0, -4, %o0 -	ld		[%o1 + 4], %o3 -	srl		%o2, 2, %g3 +	ld		[%i1], %g1 +	add		%i0, -4, %i0 +	ld		[%i1 + 4], %i3 +	srl		%i2, 2, %g3  	b		7f -	 add		%o1, 4, %o1 +	 add		%i1, 4, %i1  4: -	ld		[%o1], %o5 -	cmp		%o2, 7 -	ld		[%o1 + 4], %g1 -	srl		%o2, 2, %g3 +	ld		[%i1], %i5 +	cmp		%i2, 7 +	ld		[%i1 + 4], %g1 +	srl		%i2, 2, %g3  	bleu		10f -	 add		%o1, 8, %o1 +	 add		%i1, 8, %i1 -	ld		[%o1], %o3 +	ld		[%i1], %i3  	add		%g3, -1, %g3  5: -	sll		%o5, %g4, %g2 -	srl		%g1, %g7, %g5 +	sll		%i5, %g4, %g2 +	srl		%g1, %l0, %g5  	or		%g2, %g5, %g2 -	st		%g2, [%o0] +	st		%g2, [%i0]  7: -	ld		[%o1 + 4], %o4 +	ld		[%i1 + 4], %i4  	sll		%g1, %g4, %g2 -	srl		%o3, %g7, %g5 +	srl		%i3, %l0, %g5  	or		%g2, %g5, %g2 -	st		%g2, [%o0 + 4] +	st		%g2, [%i0 + 4]  8: -	ld		[%o1 + 8], %o5 -	sll		%o3, %g4, %g2 -	srl		%o4, %g7, %g5 +	ld		[%i1 + 8], %i5 +	sll		%i3, %g4, %g2 +	srl		%i4, %l0, %g5  	or		%g2, %g5, %g2 -	st		%g2, [%o0 + 8] +	st		%g2, [%i0 + 8]  9: -	ld		[%o1 + 12], %g1 -	sll		%o4, %g4, %g2 -	srl		%o5, %g7, %g5 +	ld		[%i1 + 12], %g1 +	sll		%i4, %g4, %g2 +	srl		%i5, %l0, %g5  	addcc		%g3, -4, %g3  	or		%g2, %g5, %g2 -	add		%o1, 16, %o1 -	st		%g2, [%o0 + 12] -	add		%o0, 16, %o0 +	add		%i1, 16, %i1 +	st		%g2, [%i0 + 12] +	add		%i0, 16, %i0  	bne,a		5b -	 ld		[%o1], %o3 +	 ld		[%i1], %i3  10: -	sll		%o5, %g4, %g2 -	srl		%g1, %g7, %g5 -	srl		%g7, 3, %g3 +	sll		%i5, %g4, %g2 +	srl		%g1, %l0, %g5 +	srl		%l0, 3, %g3  	or		%g2, %g5, %g2 -	sub		%o1, %g3, %o1 -	andcc		%o2, 2, %g0 -	st		%g2, [%o0] +	sub		%i1, %g3, %i1 +	andcc		%i2, 2, %g0 +	st		%g2, [%i0]  	be		1f -	 andcc		%o2, 1, %g0 - -	ldub		[%o1], %g2 -	add		%o1, 2, %o1 -	stb		%g2, [%o0 + 4] -	add		%o0, 2, %o0 -	ldub		[%o1 - 1], %g2 -	stb		%g2, [%o0 + 3] +	 andcc		%i2, 1, %g0 + +	ldub		[%i1], %g2 +	add		%i1, 2, %i1 +	stb		%g2, [%i0 + 4] +	add		%i0, 2, %i0 +	ldub		[%i1 - 1], %g2 +	stb		%g2, [%i0 + 3]  1:  	be		1f  	 nop -	ldub		[%o1], %g2 -	stb		%g2, [%o0 + 4] -1: -	retl -	 RETL_INSN - -#ifdef FASTER_NONALIGNED - -87:	/* faster_nonaligned */ - -	andcc		%o1, 3, %g0 -	be		3f -	 andcc		%o1, 1, %g0 - -	be		4f -	 andcc		%o1, 2, %g0 - -	ldub		[%o1], %g2 -	add		%o1, 1, %o1 -	stb		%g2, [%o0] -	sub		%o2, 1, %o2 -	bne		3f -	 add		%o0, 1, %o0 -4: -	lduh		[%o1], %g2 -	add		%o1, 2, %o1 -	srl		%g2, 8, %g3 -	sub		%o2, 2, %o2 -	stb		%g3, [%o0] -	add		%o0, 2, %o0 -	stb		%g2, [%o0 - 1] -3: -	 andcc		%o1, 4, %g0 - -	bne		2f -	 cmp		%o5, 1 - -	ld		[%o1], %o4 -	srl		%o4, 24, %g2 -	stb		%g2, [%o0] -	srl		%o4, 16, %g3 -	stb		%g3, [%o0 + 1] -	srl		%o4, 8, %g2 -	stb		%g2, [%o0 + 2] -	sub		%o2, 4, %o2 -	stb		%o4, [%o0 + 3] -	add		%o1, 4, %o1 -	add		%o0, 4, %o0 -2: -	be		33f -	 cmp		%o5, 2 -	be		32f -	 sub		%o2, 4, %o2 -31: -	ld		[%o1], %g2 -	add		%o1, 4, %o1 -	srl		%g2, 24, %g3 -	and		%o0, 7, %g5 -	stb		%g3, [%o0] -	cmp		%g5, 7 -	sll		%g2, 8, %g1 -	add		%o0, 4, %o0 -	be		41f -	 and		%o2, 0xffffffc0, %o3 -	ld		[%o0 - 7], %o4 -4: -	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	subcc		%o3, 64, %o3 -	add		%o1, 64, %o1 -	bne		4b -	 add		%o0, 64, %o0 - -	andcc		%o2, 0x30, %o3 -	be,a		1f -	 srl		%g1, 16, %g2 -4: -	SMOVE_CHUNK(o1, 
o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	subcc		%o3, 16, %o3 -	add		%o1, 16, %o1 -	bne		4b -	 add		%o0, 16, %o0 - -	srl		%g1, 16, %g2 -1: -	st		%o4, [%o0 - 7] -	sth		%g2, [%o0 - 3] -	srl		%g1, 8, %g4 -	b		88f -	 stb		%g4, [%o0 - 1] -32: -	ld		[%o1], %g2 -	add		%o1, 4, %o1 -	srl		%g2, 16, %g3 -	and		%o0, 7, %g5 -	sth		%g3, [%o0] -	cmp		%g5, 6 -	sll		%g2, 16, %g1 -	add		%o0, 4, %o0 -	be		42f -	 and		%o2, 0xffffffc0, %o3 -	ld		[%o0 - 6], %o4 -4: -	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	subcc		%o3, 64, %o3 -	add		%o1, 64, %o1 -	bne		4b -	 add		%o0, 64, %o0 - -	andcc		%o2, 0x30, %o3 -	be,a		1f -	 srl		%g1, 16, %g2 -4: -	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	subcc		%o3, 16, %o3 -	add		%o1, 16, %o1 -	bne		4b -	 add		%o0, 16, %o0 - -	srl		%g1, 16, %g2 -1: -	st		%o4, [%o0 - 6] -	b		88f -	 sth		%g2, [%o0 - 2] -33: -	ld		[%o1], %g2 -	sub		%o2, 4, %o2 -	srl		%g2, 24, %g3 -	and		%o0, 7, %g5 -	stb		%g3, [%o0] -	cmp		%g5, 5 -	srl		%g2, 8, %g4 -	sll		%g2, 24, %g1 -	sth		%g4, [%o0 + 1] -	add		%o1, 4, %o1 -	be		43f -	 and		%o2, 0xffffffc0, %o3 - -	ld		[%o0 - 1], %o4 -	add		%o0, 4, %o0 -4: -	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) -	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) -	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) -	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) -	subcc		%o3, 64, %o3 -	add		%o1, 64, %o1 -	bne		4b -	 add		%o0, 64, %o0 - -	andcc		%o2, 0x30, %o3 -	be,a		1f -	 srl		%g1, 24, %g2 -4: -	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) -	subcc		%o3, 16, %o3 -	add		%o1, 16, %o1 -	bne		4b -	 add		%o0, 16, %o0 - -	srl		%g1, 24, %g2 -1: -	st		%o4, [%o0 - 5] -	b		88f -	 stb		%g2, [%o0 - 1] -41: -	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	subcc		%o3, 64, %o3 -	add		%o1, 64, %o1 -	bne		41b -	 add		%o0, 64, %o0 -	  -	andcc		%o2, 0x30, %o3 -	be,a		1f -	 srl		%g1, 16, %g2 -4: -	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) -	subcc		%o3, 16, %o3 -	add		%o1, 16, %o1 -	bne		4b -	 add		%o0, 16, %o0 - -	srl		%g1, 16, %g2 +	ldub		[%i1], %g2 +	stb		%g2, [%i0 + 4]  1: -	sth		%g2, [%o0 - 3] -	srl		%g1, 8, %g4 -	b		88f -	 stb		%g4, [%o0 - 1] -43: -	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) -	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) -	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) -	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) -	subcc		%o3, 64, %o3 -	add		%o1, 64, %o1 -	bne		43b -	 add		%o0, 64, %o0 - -	andcc		%o2, 0x30, %o3 -	be,a		1f -	 srl		%g1, 24, %g2 -4: -	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) -	subcc		%o3, 16, %o3 -	add		%o1, 16, %o1 -	bne		4b -	 add		%o0, 16, %o0 - -	srl		%g1, 24, %g2 -1: -	stb		%g2, [%o0 + 3] -	b		88f -	 add		%o0, 4, %o0 -42: -	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, 
g4, g5, o4, o5, g7, g1, 16, 16, -2) -	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	subcc		%o3, 64, %o3 -	add		%o1, 64, %o1 -	bne		42b -	 add		%o0, 64, %o0 -	  -	andcc		%o2, 0x30, %o3 -	be,a		1f -	 srl		%g1, 16, %g2 -4: -	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) -	subcc		%o3, 16, %o3 -	add		%o1, 16, %o1 -	bne		4b -	 add		%o0, 16, %o0 - -	srl		%g1, 16, %g2 -1: -	sth		%g2, [%o0 - 2] - -	/* Fall through */ -	  -#endif /* FASTER_NONALIGNED */ +	ret +	 restore	%g7, %g0, %o0  88:	/* short_end */ @@ -1127,7 +521,7 @@ FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */  	stb		%g2, [%o0]  1:  	retl - 	 RETL_INSN +	 mov		%g7, %o0  90:	/* short_aligned_end */  	bne		88b diff --git a/arch/sparc/lib/memmove.S b/arch/sparc/lib/memmove.S index 97395802c23..b7f6334e159 100644 --- a/arch/sparc/lib/memmove.S +++ b/arch/sparc/lib/memmove.S @@ -4,11 +4,10 @@   * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)   */ +#include <linux/linkage.h> +  	.text -	.align		32 -	.globl		memmove -	.type		memmove,#function -memmove:		/* o0=dst o1=src o2=len */ +ENTRY(memmove) /* o0=dst o1=src o2=len */  	mov		%o0, %g1  	cmp		%o0, %o1  	bleu,pt		%xcc, memcpy @@ -28,4 +27,4 @@ memmove:		/* o0=dst o1=src o2=len */  	retl  	 mov		%g1, %o0 -	.size		memmove, .-memmove +ENDPROC(memmove) diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S deleted file mode 100644 index c45470d0b0c..00000000000 --- a/arch/sparc/lib/mul.S +++ /dev/null @@ -1,137 +0,0 @@ -/* - * mul.S:       This routine was taken from glibc-1.09 and is covered - *              by the GNU Library General Public License Version 2. - */ - -/* - * Signed multiply, from Appendix E of the Sparc Version 8 - * Architecture Manual. - */ - -/* - * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of - * the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies. - */ - -	.globl .mul -	.globl _Mul -.mul: -_Mul:	/* needed for export */ -	mov	%o0, %y		! multiplier -> Y -	andncc	%o0, 0xfff, %g0	! test bits 12..31 -	be	Lmul_shortway	! if zero, can do it the short way -	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V - -	/* -	 * Long multiply.  32 steps, followed by a final shift step. -	 */ -	mulscc	%o4, %o1, %o4	! 1 -	mulscc	%o4, %o1, %o4	! 2 -	mulscc	%o4, %o1, %o4	! 3 -	mulscc	%o4, %o1, %o4	! 4 -	mulscc	%o4, %o1, %o4	! 5 -	mulscc	%o4, %o1, %o4	! 6 -	mulscc	%o4, %o1, %o4	! 7 -	mulscc	%o4, %o1, %o4	! 8 -	mulscc	%o4, %o1, %o4	! 9 -	mulscc	%o4, %o1, %o4	! 10 -	mulscc	%o4, %o1, %o4	! 11 -	mulscc	%o4, %o1, %o4	! 12 -	mulscc	%o4, %o1, %o4	! 13 -	mulscc	%o4, %o1, %o4	! 14 -	mulscc	%o4, %o1, %o4	! 15 -	mulscc	%o4, %o1, %o4	! 16 -	mulscc	%o4, %o1, %o4	! 17 -	mulscc	%o4, %o1, %o4	! 18 -	mulscc	%o4, %o1, %o4	! 19 -	mulscc	%o4, %o1, %o4	! 20 -	mulscc	%o4, %o1, %o4	! 21 -	mulscc	%o4, %o1, %o4	! 22 -	mulscc	%o4, %o1, %o4	! 23 -	mulscc	%o4, %o1, %o4	! 24 -	mulscc	%o4, %o1, %o4	! 25 -	mulscc	%o4, %o1, %o4	! 26 -	mulscc	%o4, %o1, %o4	! 27 -	mulscc	%o4, %o1, %o4	! 28 -	mulscc	%o4, %o1, %o4	! 29 -	mulscc	%o4, %o1, %o4	! 30 -	mulscc	%o4, %o1, %o4	! 31 -	mulscc	%o4, %o1, %o4	! 32 -	mulscc	%o4, %g0, %o4	! final shift - -	! If %o0 was negative, the result is -	!	(%o0 * %o1) + (%o1 << 32)) -	! We fix that here. - -#if 0 -	tst	%o0 -	bge	1f -	 rd	%y, %o0 - -	! %o0 was indeed negative; fix upper 32 bits of result by subtracting  -	! %o1 (i.e., return %o4 - %o1 in %o1). 
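
The comment above explains the fixup in the deleted .mul: the 32-step mulscc loop effectively multiplies the unsigned value of %o0 by the sign-extended %o1, so when %o0 was negative the raw result is too large by %o1 << 32 and the high word gets %o1 subtracted once, as the mask-and-subtract epilogue that follows does. A C sketch of why that correction yields the signed product (mul_signed_sketch is an illustrative name):

#include <stdint.h>
#include <assert.h>

/* raw = (uint32_t)a * sign-extended b; for negative a this overshoots the
 * signed product by exactly b << 32, so subtract b from the high word. */
static int64_t mul_signed_sketch(int32_t a, int32_t b)
{
	uint64_t raw = (uint64_t)(uint32_t)a * (uint64_t)(int64_t)b;
	uint32_t hi = (uint32_t)(raw >> 32);
	uint32_t lo = (uint32_t)raw;

	if (a < 0)			/* assembly: hi -= b & (a >> 31) */
		hi -= (uint32_t)b;

	return (int64_t)(((uint64_t)hi << 32) | lo);
}

int main(void)
{
	assert(mul_signed_sketch(-3, 5) == -15);
	assert(mul_signed_sketch(-70000, 70000) == -4900000000LL);
	assert(mul_signed_sketch(12345, -6789) == 12345LL * -6789);
	return 0;
}
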
-	retl -	 sub	%o4, %o1, %o1 - -1: -	retl -	 mov	%o4, %o1 -#else -	/* Faster code adapted from tege@sics.se's code for umul.S.  */ -	sra	%o0, 31, %o2	! make mask from sign bit -	and	%o1, %o2, %o2	! %o2 = 0 or %o1, depending on sign of %o0 -	rd	%y, %o0		! get lower half of product -	retl -	 sub	%o4, %o2, %o1	! subtract compensation  -				!  and put upper half in place -#endif - -Lmul_shortway: -	/* -	 * Short multiply.  12 steps, followed by a final shift step. -	 * The resulting bits are off by 12 and (32-12) = 20 bit positions, -	 * but there is no problem with %o0 being negative (unlike above). -	 */ -	mulscc	%o4, %o1, %o4	! 1 -	mulscc	%o4, %o1, %o4	! 2 -	mulscc	%o4, %o1, %o4	! 3 -	mulscc	%o4, %o1, %o4	! 4 -	mulscc	%o4, %o1, %o4	! 5 -	mulscc	%o4, %o1, %o4	! 6 -	mulscc	%o4, %o1, %o4	! 7 -	mulscc	%o4, %o1, %o4	! 8 -	mulscc	%o4, %o1, %o4	! 9 -	mulscc	%o4, %o1, %o4	! 10 -	mulscc	%o4, %o1, %o4	! 11 -	mulscc	%o4, %o1, %o4	! 12 -	mulscc	%o4, %g0, %o4	! final shift - -	/* -	 *  %o4 has 20 of the bits that should be in the low part of the -	 * result; %y has the bottom 12 (as %y's top 12).  That is: -	 * -	 *	  %o4		    %y -	 * +----------------+----------------+ -	 * | -12- |   -20-  | -12- |   -20-  | -	 * +------(---------+------)---------+ -	 *  --hi-- ----low-part---- -	 * -	 * The upper 12 bits of %o4 should be sign-extended to form the -	 * high part of the product (i.e., highpart = %o4 >> 20). -	 */ - -	rd	%y, %o5 -	sll	%o4, 12, %o0	! shift middle bits left 12 -	srl	%o5, 20, %o5	! shift low bits right 20, zero fill at left -	or	%o5, %o0, %o0	! construct low part of result -	retl -	 sra	%o4, 20, %o1	! ... and extract high part of result - -	.globl	.mul_patch -.mul_patch: -	smul	%o0, %o1, %o0 -	retl -	 rd	%y, %o1 -	nop diff --git a/arch/sparc/lib/muldi3.S b/arch/sparc/lib/muldi3.S index 7f17872d060..9794939d1c1 100644 --- a/arch/sparc/lib/muldi3.S +++ b/arch/sparc/lib/muldi3.S @@ -63,12 +63,12 @@ __muldi3:  	rd  %y, %o1  	mov  %o1, %l3  	mov  %i1, %o0 -	call  .umul  	mov  %i2, %o1 +	umul %o0, %o1, %o0  	mov  %o0, %l0  	mov  %i0, %o0 -	call  .umul  	mov  %i3, %o1 +	umul %o0, %o1, %o0  	add  %l0, %o0, %l0  	mov  %l2, %i0  	add  %l2, %l0, %i0 diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S deleted file mode 100644 index 42fb8625281..00000000000 --- a/arch/sparc/lib/rem.S +++ /dev/null @@ -1,384 +0,0 @@ -/* - * rem.S:       This routine was taken from glibc-1.09 and is covered - *              by the GNU Library General Public License Version 2. - */ - - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .rem	name of function to generate - *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - *  true		true=true => signed; true=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. 
- *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - -	.globl .rem -	.globl _Rem -.rem: -_Rem:	/* needed for export */ -	! compute sign of result; if neither is negative, no problem -	orcc	%o1, %o0, %g0	! either negative? -	bge	2f			! no, go do the divide -	 mov	%o0, %g2	! compute sign in any case - -	tst	%o1 -	bge	1f -	 tst	%o0 -	! %o1 is definitely negative; %o0 might also be negative -	bge	2f			! if %o0 not negative... -	 sub	%g0, %o1, %o1	! in any case, make %o1 nonneg -1:	! %o0 is negative, %o1 is nonnegative -	sub	%g0, %o0, %o0	! make %o0 nonnegative -2: - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	 mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	ST_DIV0 -		retl -		 clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	Lgot_result		! (and algorithm fails otherwise) -	 clr	%o2 - -	sethi	%hi(1 << (32 - 4 - 1)), %g1 - -	cmp	%o3, %g1 -	blu	Lnot_really_big -	 clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		 mov	1, %g7 - -		sll	%o5, 4, %o5 - -		b	1b -		 add	%o4, 1, %o4 - -	! Now compute %g7. -	2: -		addcc	%o5, %o5, %o5 - -		bcc	Lnot_too_big -		 add	%g7, 1, %g7 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 - -		b	Ldo_single_div -		 sub	%g7, 1, %g7 - -	Lnot_too_big: -	3: -		cmp	%o5, %o3 -		blu	2b -		 nop - -		be	Ldo_single_div -		 nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g7 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	Ldo_single_div: -		subcc	%g7, 1, %g7 -		bl	Lend_regular_divide -		 nop - -		sub	%o3, %o5, %o3 -		mov	1, %o2 - -		b	Lend_single_divloop -		 nop -	Lsingle_divloop: -		sll	%o2, 1, %o2 - -		bl	1f -		 srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 - -		b	2f -		 add	%o2, 1, %o2 -	1:	! %o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	Lend_single_divloop: -		subcc	%g7, 1, %g7 -		bge	Lsingle_divloop -		 tst	%o3 - -		b,a	Lend_regular_divide - -Lnot_really_big: -1: -	sll	%o5, 4, %o5 -	cmp	%o5, %o3 -	bleu	1b -	 addcc	%o4, 1, %o4 -	be	Lgot_result -	 sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	L.1.16 -	 srl	%o5,1,%o5 -	! 
remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	L.2.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	L.3.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	L.4.23 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 - -	b	9f -	 add	%o2, (7*2+1), %o2 -	 -L.4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2-1), %o2 -	 -L.3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	L.4.21 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2+1), %o2 -	 -L.4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2-1), %o2 -	 -L.2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	L.3.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	L.4.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2+1), %o2 - -L.4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2-1), %o2 - -L.3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	L.4.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2+1), %o2 - -L.4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2-1), %o2 - -L.1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 2, accumulated bits -1 -	bl	L.2.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	L.3.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	L.4.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2+1), %o2 - -L.4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2-1), %o2 - -L.3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	L.4.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2+1), %o2 - -L.4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2-1), %o2 - -L.2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	L.3.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	L.4.11 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2+1), %o2 - -L.4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2-1), %o2 - - -L.3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	L.4.9 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2+1), %o2 - -L.4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2-1), %o2 - -	9: -Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	Ldivloop -	 tst	%o3 - -	bl,a	Lgot_result -	! non-restoring fixup here (one instruction only!) -	add	%o3, %o1, %o3 - -Lgot_result: -	! 
check to see if answer should be < 0 -	tst	%g2 -	bl,a	1f -	 sub %g0, %o3, %o3 -1: -	retl -	 mov %o3, %o0 - -	.globl	.rem_patch -.rem_patch: -	sra	%o0, 0x1f, %o4 -	wr	%o4, 0x0, %y -	nop -	nop -	nop -	sdivcc	%o0, %o1, %o2 -	bvs,a	1f -	 xnor	%o2, %g0, %o2 -1:	smul	%o2, %o1, %o2 -	retl -	 sub	%o0, %o2, %o0 -	nop diff --git a/arch/sparc/lib/rwsem_32.S b/arch/sparc/lib/rwsem_32.S deleted file mode 100644 index 9675268e7fd..00000000000 --- a/arch/sparc/lib/rwsem_32.S +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Assembly part of rw semaphores. - * - * Copyright (C) 1999 Jakub Jelinek (jakub@redhat.com) - */ - -#include <asm/ptrace.h> -#include <asm/psr.h> - -	.section .sched.text, "ax" -	.align	4 - -	.globl		___down_read -___down_read: -	rd		%psr, %g3 -	nop -	nop -	nop -	or		%g3, PSR_PIL, %g7 -	wr		%g7, 0, %psr -	nop -	nop -	nop -#ifdef CONFIG_SMP -1:	ldstub		[%g1 + 4], %g7 -	tst		%g7 -	bne		1b -	 ld		[%g1], %g7 -	sub		%g7, 1, %g7 -	st		%g7, [%g1] -	stb		%g0, [%g1 + 4] -#else -	ld		[%g1], %g7 -	sub		%g7, 1, %g7 -	st		%g7, [%g1] -#endif -	wr		%g3, 0, %psr -	add		%g7, 1, %g7 -	nop -	nop -	subcc		%g7, 1, %g7 -	bneg		3f -	 nop -2:	jmpl		%o7, %g0 -	 mov		%g4, %o7 -3:	save		%sp, -64, %sp -	mov		%g1, %l1 -	mov		%g4, %l4 -	bcs		4f -	 mov		%g5, %l5 -	call		down_read_failed -	 mov		%l1, %o0 -	mov		%l1, %g1 -	mov		%l4, %g4 -	ba		___down_read -	 restore	%l5, %g0, %g5 -4:	call		down_read_failed_biased -	 mov		%l1, %o0 -	mov		%l1, %g1 -	mov		%l4, %g4 -	ba		2b -	 restore	%l5, %g0, %g5 - -	.globl		___down_write -___down_write: -	rd		%psr, %g3 -	nop -	nop -	nop -	or		%g3, PSR_PIL, %g7 -	wr		%g7, 0, %psr -	sethi		%hi(0x01000000), %g2 -	nop -	nop -#ifdef CONFIG_SMP -1:	ldstub		[%g1 + 4], %g7 -	tst		%g7 -	bne		1b -	 ld		[%g1], %g7 -	sub		%g7, %g2, %g7 -	st		%g7, [%g1] -	stb		%g0, [%g1 + 4] -#else -	ld		[%g1], %g7 -	sub		%g7, %g2, %g7 -	st		%g7, [%g1] -#endif -	wr		%g3, 0, %psr -	add		%g7, %g2, %g7 -	nop -	nop -	subcc		%g7, %g2, %g7 -	bne		3f -	 nop -2:	jmpl		%o7, %g0 -	 mov		%g4, %o7 -3:	save		%sp, -64, %sp -	mov		%g1, %l1 -	mov		%g4, %l4 -	bcs		4f -	 mov		%g5, %l5 -	call		down_write_failed -	 mov		%l1, %o0 -	mov		%l1, %g1 -	mov		%l4, %g4 -	ba		___down_write -	 restore	%l5, %g0, %g5 -4:	call		down_write_failed_biased -	 mov		%l1, %o0 -	mov		%l1, %g1 -	mov		%l4, %g4 -	ba		2b -	 restore	%l5, %g0, %g5 - -	.text -	.globl		___up_read -___up_read: -	rd		%psr, %g3 -	nop -	nop -	nop -	or		%g3, PSR_PIL, %g7 -	wr		%g7, 0, %psr -	nop -	nop -	nop -#ifdef CONFIG_SMP -1:	ldstub		[%g1 + 4], %g7 -	tst		%g7 -	bne		1b -	 ld		[%g1], %g7 -	add		%g7, 1, %g7 -	st		%g7, [%g1] -	stb		%g0, [%g1 + 4] -#else -	ld		[%g1], %g7 -	add		%g7, 1, %g7 -	st		%g7, [%g1] -#endif -	wr		%g3, 0, %psr -	nop -	nop -	nop -	cmp		%g7, 0 -	be		3f -	 nop -2:	jmpl		%o7, %g0 -	 mov		%g4, %o7 -3:	save		%sp, -64, %sp -	mov		%g1, %l1 -	mov		%g4, %l4 -	mov		%g5, %l5 -	clr		%o1 -	call		__rwsem_wake -	 mov		%l1, %o0 -	mov		%l1, %g1 -	mov		%l4, %g4 -	ba		2b -	 restore	%l5, %g0, %g5 - -	.globl		___up_write -___up_write: -	rd		%psr, %g3 -	nop -	nop -	nop -	or		%g3, PSR_PIL, %g7 -	wr		%g7, 0, %psr -	sethi		%hi(0x01000000), %g2 -	nop -	nop -#ifdef CONFIG_SMP -1:	ldstub		[%g1 + 4], %g7 -	tst		%g7 -	bne		1b -	 ld		[%g1], %g7 -	add		%g7, %g2, %g7 -	st		%g7, [%g1] -	stb		%g0, [%g1 + 4] -#else -	ld		[%g1], %g7 -	add		%g7, %g2, %g7 -	st		%g7, [%g1] -#endif -	wr		%g3, 0, %psr -	sub		%g7, %g2, %g7 -	nop -	nop -	addcc		%g7, %g2, %g7 -	bcs		3f -	 nop -2:	jmpl		%o7, %g0 -	 mov		%g4, %o7 -3:	save		%sp, -64, %sp -	mov		%g1, %l1 -	mov		%g4, %l4 -	mov		%g5, %l5 -	mov		%g7, %o1 -	call		
__rwsem_wake -	 mov		%l1, %o0 -	mov		%l1, %g1 -	mov		%l4, %g4 -	ba		2b -	 restore	%l5, %g0, %g5 diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S deleted file mode 100644 index f0a0d4e4db7..00000000000 --- a/arch/sparc/lib/sdiv.S +++ /dev/null @@ -1,381 +0,0 @@ -/* - * sdiv.S:      This routine was taken from glibc-1.09 and is covered - *              by the GNU Library General Public License Version 2. - */ - - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .div	name of function to generate - *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1 - *  true		true=true => signed; true=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - -	.globl .div -	.globl _Div -.div: -_Div:	/* needed for export */ -	! compute sign of result; if neither is negative, no problem -	orcc	%o1, %o0, %g0	! either negative? -	bge	2f			! no, go do the divide -	 xor	%o1, %o0, %g2	! compute sign in any case - -	tst	%o1 -	bge	1f -	 tst	%o0 -	! %o1 is definitely negative; %o0 might also be negative -	bge	2f			! if %o0 not negative... -	 sub	%g0, %o1, %o1	! in any case, make %o1 nonneg -1:	! %o0 is negative, %o1 is nonnegative -	sub	%g0, %o0, %o0	! make %o0 nonnegative -2: - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	 mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	ST_DIV0 -		retl -		 clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	Lgot_result		! (and algorithm fails otherwise) -	 clr	%o2 - -	sethi	%hi(1 << (32 - 4 - 1)), %g1 - -	cmp	%o3, %g1 -	blu	Lnot_really_big -	 clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		 mov	1, %g7 - -		sll	%o5, 4, %o5 - -		b	1b -		 add	%o4, 1, %o4 - -	! Now compute %g7. -	2: -		addcc	%o5, %o5, %o5 -		bcc	Lnot_too_big -		 add	%g7, 1, %g7 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! 
rest of %o5 -		add	%o5, %g1, %o5 - -		b	Ldo_single_div -		 sub	%g7, 1, %g7 - -	Lnot_too_big: -	3: -		cmp	%o5, %o3 -		blu	2b -		 nop - -		be	Ldo_single_div -		 nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g7 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	Ldo_single_div: -		subcc	%g7, 1, %g7 -		bl	Lend_regular_divide -		 nop - -		sub	%o3, %o5, %o3 -		mov	1, %o2 - -		b	Lend_single_divloop -		 nop -	Lsingle_divloop: -		sll	%o2, 1, %o2 - -		bl	1f -		 srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 - -		b	2f -		 add	%o2, 1, %o2 -	1:	! %o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	Lend_single_divloop: -		subcc	%g7, 1, %g7 -		bge	Lsingle_divloop -		 tst	%o3 - -		b,a	Lend_regular_divide - -Lnot_really_big: -1: -	sll	%o5, 4, %o5 -	cmp	%o5, %o3 -	bleu	1b -	 addcc	%o4, 1, %o4 - -	be	Lgot_result -	 sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	L.1.16 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	L.2.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	L.3.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	L.4.23 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2+1), %o2 - -L.4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2-1), %o2 - -L.3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	L.4.21 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2+1), %o2 - -L.4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2-1), %o2 - -L.2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	L.3.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	L.4.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2+1), %o2 - -L.4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2-1), %o2 -	 -	 -L.3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	L.4.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2+1), %o2 - -L.4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2-1), %o2 - -L.1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 2, accumulated bits -1 -	bl	L.2.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	L.3.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	L.4.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2+1), %o2 - -L.4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2-1), %o2 - -L.3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! 
depth 4, accumulated bits -3 -	bl	L.4.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2+1), %o2 - -L.4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2-1), %o2 - -L.2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	L.3.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	L.4.11 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2+1), %o2 - -L.4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2-1), %o2 - -L.3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	L.4.9 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2+1), %o2 - -L.4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2-1), %o2 - -	9: -Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	Ldivloop -	 tst	%o3 - -	bl,a	Lgot_result -	! non-restoring fixup here (one instruction only!) -	sub	%o2, 1, %o2 - -Lgot_result: -	! check to see if answer should be < 0 -	tst	%g2 -	bl,a	1f -	 sub %g0, %o2, %o2 -1: -	retl -	 mov %o2, %o0 - -	.globl	.div_patch -.div_patch: -	sra	%o0, 0x1f, %o2 -	wr	%o2, 0x0, %y -	nop -	nop -	nop -	sdivcc	%o0, %o1, %o0 -	bvs,a	1f -	 xnor	%o0, %g0, %o0 -1:	retl -	 nop diff --git a/arch/sparc/lib/strlen_user_32.S b/arch/sparc/lib/strlen_user_32.S deleted file mode 100644 index 8c8a371df3c..00000000000 --- a/arch/sparc/lib/strlen_user_32.S +++ /dev/null @@ -1,109 +0,0 @@ -/* strlen_user.S: Sparc optimized strlen_user code - * - * Return length of string in userspace including terminating 0 - * or 0 for error - * - * Copyright (C) 1991,1996 Free Software Foundation - * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 - -10: -	ldub	[%o0], %o5 -	cmp	%o5, 0 -	be	1f -	 add	%o0, 1, %o0 -	andcc	%o0, 3, %g0 -	be	4f -	 or	%o4, %lo(HI_MAGIC), %o3 -11: -	ldub	[%o0], %o5 -	cmp	%o5, 0 -	be	2f -	 add	%o0, 1, %o0 -	andcc	%o0, 3, %g0 -	be	5f -	 sethi	%hi(LO_MAGIC), %o4 -12: -	ldub	[%o0], %o5 -	cmp	%o5, 0 -	be	3f -	 add	%o0, 1, %o0 -	b	13f -	 or	%o4, %lo(LO_MAGIC), %o2 -1: -	retl -	 mov	1, %o0 -2: -	retl -	 mov	2, %o0 -3: -	retl -	 mov	3, %o0 - -	.align 4 -	.global __strlen_user, __strnlen_user -__strlen_user: -	sethi	%hi(32768), %o1 -__strnlen_user: -	mov	%o1, %g1 -	mov	%o0, %o1 -	andcc	%o0, 3, %g0 -	bne	10b -	 sethi	%hi(HI_MAGIC), %o4 -	or	%o4, %lo(HI_MAGIC), %o3 -4: -	sethi	%hi(LO_MAGIC), %o4 -5: -	or	%o4, %lo(LO_MAGIC), %o2 -13: -	ld	[%o0], %o5 -2: -	sub	%o5, %o2, %o4 -	andcc	%o4, %o3, %g0 -	bne	82f -	 add	%o0, 4, %o0 -	sub	%o0, %o1, %g2 -81:	cmp	%g2, %g1 -	blu	13b -	 mov	%o0, %o4 -	ba,a	1f - -	/* Check every byte. 
*/ -82:	srl	%o5, 24, %g5 -	andcc	%g5, 0xff, %g0 -	be	1f -	 add	%o0, -3, %o4 -	srl	%o5, 16, %g5 -	andcc	%g5, 0xff, %g0 -	be	1f -	 add	%o4, 1, %o4 -	srl	%o5, 8, %g5 -	andcc	%g5, 0xff, %g0 -	be	1f -	 add	%o4, 1, %o4 -	andcc	%o5, 0xff, %g0 -	bne	81b -	 sub	%o0, %o1, %g2 - -	add	%o4, 1, %o4 -1: -	retl -	 sub	%o4, %o1, %o0 - -	.section .fixup,#alloc,#execinstr -	.align	4 -9: -	retl -	 clr	%o0 - -	.section __ex_table,#alloc -	.align	4 - -	.word	10b, 9b -	.word	11b, 9b -	.word	12b, 9b -	.word	13b, 9b diff --git a/arch/sparc/lib/strlen_user_64.S b/arch/sparc/lib/strlen_user_64.S deleted file mode 100644 index 114ed111e25..00000000000 --- a/arch/sparc/lib/strlen_user_64.S +++ /dev/null @@ -1,95 +0,0 @@ -/* strlen_user.S: Sparc64 optimized strlen_user code - * - * Return length of string in userspace including terminating 0 - * or 0 for error - * - * Copyright (C) 1991,1996 Free Software Foundation - * Copyright (C) 1996,1999 David S. Miller (davem@redhat.com) - * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#include <asm/asi.h> - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 - -	.align 4 -	.global __strlen_user, __strnlen_user -__strlen_user: -	sethi	%hi(32768), %o1 -__strnlen_user:	 -	mov	%o1, %g1 -	mov	%o0, %o1 -	andcc	%o0, 3, %g0 -	be,pt	%icc, 9f -	 sethi	%hi(HI_MAGIC), %o4 -10:	lduba	[%o0] %asi, %o5 -	brz,pn	%o5, 21f -	 add	%o0, 1, %o0 -	andcc	%o0, 3, %g0 -	be,pn	%icc, 4f -	 or	%o4, %lo(HI_MAGIC), %o3 -11:	lduba	[%o0] %asi, %o5 -	brz,pn	%o5, 22f -	 add	%o0, 1, %o0 -	andcc	%o0, 3, %g0 -	be,pt	%icc, 13f -	 srl	%o3, 7, %o2 -12:	lduba	[%o0] %asi, %o5 -	brz,pn	%o5, 23f -	 add	%o0, 1, %o0 -	ba,pt	%icc, 2f -15:	 lda	[%o0] %asi, %o5 -9:	or	%o4, %lo(HI_MAGIC), %o3 -4:	srl	%o3, 7, %o2 -13:	lda	[%o0] %asi, %o5 -2:	sub	%o5, %o2, %o4 -	andcc	%o4, %o3, %g0 -	bne,pn	%icc, 82f -	 add	%o0, 4, %o0 -	sub	%o0, %o1, %g2 -81:	cmp	%g2, %g1 -	blu,pt	%icc, 13b -	 mov	%o0, %o4 -	ba,a,pt	%xcc, 1f - -	/* Check every byte. */ -82:	srl	%o5, 24, %g7 -	andcc	%g7, 0xff, %g0 -	be,pn	%icc, 1f -	 add	%o0, -3, %o4 -	srl	%o5, 16, %g7 -	andcc	%g7, 0xff, %g0 -	be,pn	%icc, 1f -	 add	%o4, 1, %o4 -	srl	%o5, 8, %g7 -	andcc	%g7, 0xff, %g0 -	be,pn	%icc, 1f -	 add	%o4, 1, %o4 -	andcc	%o5, 0xff, %g0 -	bne,pt	%icc, 81b -	 sub	%o0, %o1, %g2 -	add	%o4, 1, %o4 -1:	retl -	 sub	%o4, %o1, %o0 -21:	retl -	 mov	1, %o0 -22:	retl -	 mov	2, %o0 -23:	retl -	 mov	3, %o0 - -        .section .fixup,#alloc,#execinstr -        .align  4 -30: -        retl -         clr    %o0 - -	.section __ex_table,"a" -	.align	4 - -	.word	10b, 30b -	.word	11b, 30b -	.word	12b, 30b -	.word	15b, 30b -	.word	13b, 30b diff --git a/arch/sparc/lib/strncmp_32.S b/arch/sparc/lib/strncmp_32.S index 494ec664537..c0d1b568c1c 100644 --- a/arch/sparc/lib/strncmp_32.S +++ b/arch/sparc/lib/strncmp_32.S @@ -3,11 +3,10 @@   *            generic strncmp routine.   
*/ +#include <linux/linkage.h> +  	.text -	.align 4 -	.global __strncmp, strncmp -__strncmp: -strncmp: +ENTRY(strncmp)  	mov	%o0, %g3  	mov	0, %o3 @@ -116,3 +115,4 @@ strncmp:  	and	%g2, 0xff, %o0  	retl  	 sub	%o3, %o0, %o0 +ENDPROC(strncmp) diff --git a/arch/sparc/lib/strncmp_64.S b/arch/sparc/lib/strncmp_64.S index 980e8375155..0656627166f 100644 --- a/arch/sparc/lib/strncmp_64.S +++ b/arch/sparc/lib/strncmp_64.S @@ -4,13 +4,11 @@   * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)   */ +#include <linux/linkage.h>  #include <asm/asi.h>  	.text -	.align	32 -	.globl	strncmp -	.type	strncmp,#function -strncmp: +ENTRY(strncmp)  	brlez,pn %o2, 3f  	 lduba	[%o0] (ASI_PNF), %o3  1: @@ -29,4 +27,4 @@ strncmp:  3:  	retl  	 clr	%o0 -	.size	strncmp, .-strncmp +ENDPROC(strncmp) diff --git a/arch/sparc/lib/strncpy_from_user_32.S b/arch/sparc/lib/strncpy_from_user_32.S deleted file mode 100644 index d77198976a6..00000000000 --- a/arch/sparc/lib/strncpy_from_user_32.S +++ /dev/null @@ -1,47 +0,0 @@ -/* strncpy_from_user.S: Sparc strncpy from userspace. - * - *  Copyright(C) 1996 David S. Miller - */ - -#include <asm/ptrace.h> -#include <asm/errno.h> - -	.text -	.align	4 - -	/* Must return: -	 * -	 * -EFAULT		for an exception -	 * count		if we hit the buffer limit -	 * bytes copied		if we hit a null byte -	 */ - -	.globl	__strncpy_from_user -__strncpy_from_user: -	/* %o0=dest, %o1=src, %o2=count */ -	mov	%o2, %o3 -1: -	subcc	%o2, 1, %o2 -	bneg	2f -	 nop -10: -	ldub	[%o1], %o4 -	add	%o0, 1, %o0 -	cmp	%o4, 0 -	add	%o1, 1, %o1 -	bne	1b -	 stb	%o4, [%o0 - 1] -2: -	add	%o2, 1, %o0 -	retl -	 sub	%o3, %o0, %o0 - -	.section .fixup,#alloc,#execinstr -	.align	4 -4: -	retl -	 mov	-EFAULT, %o0 - -	.section __ex_table,#alloc -	.align	4 -	.word	10b, 4b diff --git a/arch/sparc/lib/strncpy_from_user_64.S b/arch/sparc/lib/strncpy_from_user_64.S deleted file mode 100644 index 511c8f136f9..00000000000 --- a/arch/sparc/lib/strncpy_from_user_64.S +++ /dev/null @@ -1,135 +0,0 @@ -/* - * strncpy_from_user.S: Sparc64 strncpy from userspace. - * - *  Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) - */ - -#include <asm/asi.h> -#include <asm/errno.h> - -	.data -	.align	8 -0:	.xword	0x0101010101010101 - -	.text -	.align	32 - -	/* Must return: -	 * -	 * -EFAULT		for an exception -	 * count		if we hit the buffer limit -	 * bytes copied		if we hit a null byte -	 * (without the null byte) -	 * -	 * This implementation assumes: -	 * %o1 is 8 aligned => !(%o2 & 7) -	 * %o0 is 8 aligned (if not, it will be slooooow, but will work) -	 * -	 * This is optimized for the common case: -	 * in my stats, 90% of src are 8 aligned (even on sparc32) -	 * and average length is 18 or so. -	 */ - -	.globl	__strncpy_from_user -	.type	__strncpy_from_user,#function -__strncpy_from_user: -	/* %o0=dest, %o1=src, %o2=count */ -	andcc	%o1, 7, %g0		! IEU1	Group -	bne,pn	%icc, 30f		! CTI -	 add	%o0, %o2, %g3		! IEU0 -60:	ldxa	[%o1] %asi, %g1		! Load	Group -	brlez,pn %o2, 10f		! CTI -	 mov	%o0, %o3		! IEU0 -50:	sethi	%hi(0b), %o4		! IEU0	Group -	ldx	[%o4 + %lo(0b)], %o4	! Load -	sllx	%o4, 7, %o5		! IEU1	Group -1:	sub	%g1, %o4, %g2		! IEU0	Group -	stx	%g1, [%o0]		! Store -	add	%o0, 8, %o0		! IEU1 -	andcc	%g2, %o5, %g0		! IEU1	Group -	bne,pn	%xcc, 5f		! CTI -	 add	%o1, 8, %o1		! IEU0 -	cmp	%o0, %g3		! IEU1	Group -	bl,a,pt %xcc, 1b		! CTI -61:	 ldxa	[%o1] %asi, %g1		! Load -10:	retl				! CTI	Group -	 mov	%o2, %o0		! IEU0 -5:	srlx	%g2, 32, %g7		! IEU0	Group -	sethi	%hi(0xff00), %o4	! IEU1 -	andcc	%g7, %o5, %g0		! 
IEU1	Group -	be,pn	%icc, 2f		! CTI -	 or	%o4, %lo(0xff00), %o4	! IEU0 -	srlx	%g1, 48, %g7		! IEU0	Group -	andcc	%g7, %o4, %g0		! IEU1	Group -	be,pn	%icc, 50f		! CTI -	 andcc	%g7, 0xff, %g0		! IEU1	Group -	be,pn	%icc, 51f		! CTI -	 srlx	%g1, 32, %g7		! IEU0 -	andcc	%g7, %o4, %g0		! IEU1	Group -	be,pn	%icc, 52f		! CTI -	 andcc	%g7, 0xff, %g0		! IEU1	Group -	be,pn	%icc, 53f		! CTI -2:	 andcc	%g2, %o5, %g0		! IEU1	Group -	be,pn	%icc, 2f		! CTI -	 srl	%g1, 16, %g7		! IEU0 -	andcc	%g7, %o4, %g0		! IEU1	Group -	be,pn	%icc, 54f		! CTI -	 andcc	%g7, 0xff, %g0		! IEU1	Group -	be,pn	%icc, 55f		! CTI -	 andcc	%g1, %o4, %g0		! IEU1	Group -	be,pn	%icc, 56f		! CTI -	 andcc	%g1, 0xff, %g0		! IEU1	Group -	be,a,pn	%icc, 57f		! CTI -	 sub	%o0, %o3, %o0		! IEU0 -2:	cmp	%o0, %g3		! IEU1	Group -	bl,a,pt	%xcc, 50b		! CTI -62:	 ldxa	[%o1] %asi, %g1		! Load -	retl				! CTI	Group -	 mov	%o2, %o0		! IEU0 -50:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 8, %o0 -51:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 7, %o0 -52:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 6, %o0 -53:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 5, %o0 -54:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 4, %o0 -55:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 3, %o0 -56:	sub	%o0, %o3, %o0 -	retl -	 sub	%o0, 2, %o0 -57:	retl -	 sub	%o0, 1, %o0 -30:	brlez,pn %o2, 3f -	 sub	%g0, %o2, %o3 -	add	%o0, %o2, %o0 -63:	lduba	[%o1] %asi, %o4 -1:	add	%o1, 1, %o1 -	brz,pn	%o4, 2f -	 stb	%o4, [%o0 + %o3] -	addcc	%o3, 1, %o3 -	bne,pt	%xcc, 1b -64:	 lduba	[%o1] %asi, %o4 -3:	retl -	 mov	%o2, %o0 -2:	retl -	 add	%o2, %o3, %o0 -	.size	__strncpy_from_user, .-__strncpy_from_user - -	.section __ex_table,"a" -	.align	4 -	.word	60b, __retl_efault -	.word	61b, __retl_efault -	.word	62b, __retl_efault -	.word	63b, __retl_efault -	.word	64b, __retl_efault -	.previous diff --git a/arch/sparc/lib/ucmpdi2.c b/arch/sparc/lib/ucmpdi2.c new file mode 100644 index 00000000000..1e06ed50068 --- /dev/null +++ b/arch/sparc/lib/ucmpdi2.c @@ -0,0 +1,19 @@ +#include <linux/module.h> +#include "libgcc.h" + +word_type __ucmpdi2(unsigned long long a, unsigned long long b) +{ +	const DWunion au = {.ll = a}; +	const DWunion bu = {.ll = b}; + +	if ((unsigned int) au.s.high < (unsigned int) bu.s.high) +		return 0; +	else if ((unsigned int) au.s.high > (unsigned int) bu.s.high) +		return 2; +	if ((unsigned int) au.s.low < (unsigned int) bu.s.low) +		return 0; +	else if ((unsigned int) au.s.low > (unsigned int) bu.s.low) +		return 2; +	return 1; +} +EXPORT_SYMBOL(__ucmpdi2); diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S deleted file mode 100644 index 2101405bdfc..00000000000 --- a/arch/sparc/lib/udiv.S +++ /dev/null @@ -1,357 +0,0 @@ -/* - * udiv.S:      This routine was taken from glibc-1.09 and is covered - *              by the GNU Library General Public License Version 2. - */ - - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. 
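
The deleted strlen_user and strncpy_from_user routines above all lean on the same word-at-a-time trick: with LO_MAGIC = 0x01...01 and HI_MAGIC = 0x80...80 (LO_MAGIC shifted left by 7), (w - LO_MAGIC) & HI_MAGIC is non-zero whenever w contains a zero byte; it can also fire when no byte is zero (for example a byte >= 0x81), which is why the assembly drops into its "check every byte" path after a hit. A small C illustration at 32-bit width (the 64-bit variants use 0x0101010101010101 the same way):

#include <stdint.h>
#include <assert.h>

#define LO_MAGIC 0x01010101u
#define HI_MAGIC 0x80808080u	/* LO_MAGIC << 7 */

/* Non-zero whenever w contains a zero byte; may report false positives,
 * hence the byte-by-byte re-check after a hit. */
static int may_have_zero_byte(uint32_t w)
{
	return ((w - LO_MAGIC) & HI_MAGIC) != 0;
}

int main(void)
{
	assert(may_have_zero_byte(0x41420043u));	/* real zero byte */
	assert(!may_have_zero_byte(0x41424344u));	/* "ABCD": no hit */
	assert(may_have_zero_byte(0x90414243u));	/* false positive, no zero byte */
	return 0;
}
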
- * - * m4 parameters: - *  .udiv	name of function to generate - *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1 - *  false		false=true => signed; false=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - -	.globl .udiv -	.globl _Udiv -.udiv: -_Udiv:	/* needed for export */ - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	 mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	ST_DIV0 -		retl -		 clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	Lgot_result		! (and algorithm fails otherwise) -	 clr	%o2 - -	sethi	%hi(1 << (32 - 4 - 1)), %g1 - -	cmp	%o3, %g1 -	blu	Lnot_really_big -	 clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		 mov	1, %g7 - -		sll	%o5, 4, %o5 - -		b	1b -		 add	%o4, 1, %o4 - -	! Now compute %g7. -	2: -		addcc	%o5, %o5, %o5 -		bcc	Lnot_too_big -		 add	%g7, 1, %g7 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 - -		b	Ldo_single_div -		 sub	%g7, 1, %g7 - -	Lnot_too_big: -	3: -		cmp	%o5, %o3 -		blu	2b -		 nop - -		be	Ldo_single_div -		 nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g7 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	Ldo_single_div: -		subcc	%g7, 1, %g7 -		bl	Lend_regular_divide -		 nop - -		sub	%o3, %o5, %o3 -		mov	1, %o2 - -		b	Lend_single_divloop -		 nop -	Lsingle_divloop: -		sll	%o2, 1, %o2 -		bl	1f -		 srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 -		b	2f -		 add	%o2, 1, %o2 -	1:	! 
%o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	Lend_single_divloop: -		subcc	%g7, 1, %g7 -		bge	Lsingle_divloop -		 tst	%o3 - -		b,a	Lend_regular_divide - -Lnot_really_big: -1: -	sll	%o5, 4, %o5 - -	cmp	%o5, %o3 -	bleu	1b -	 addcc	%o4, 1, %o4 - -	be	Lgot_result -	 sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	L.1.16 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	L.2.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	L.3.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	L.4.23 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2+1), %o2 - -L.4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2-1), %o2 - -L.3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	L.4.21 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2+1), %o2 - -L.4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2-1), %o2 - -L.2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	L.3.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	L.4.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2+1), %o2 - -L.4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2-1), %o2 - -L.3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	L.4.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2+1), %o2 - -L.4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2-1), %o2 - -L.1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 2, accumulated bits -1 -	bl	L.2.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	L.3.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	L.4.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2+1), %o2 - -L.4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2-1), %o2 - -L.3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	L.4.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2+1), %o2 - -L.4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2-1), %o2 - -L.2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	L.3.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	L.4.11 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2+1), %o2 - -L.4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2-1), %o2 - -L.3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	L.4.9 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2+1), %o2 - -L.4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2-1), %o2 - -	9: -Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	Ldivloop -	 tst	%o3 - -	bl,a	Lgot_result -	! non-restoring fixup here (one instruction only!) 
-	sub	%o2, 1, %o2 - -Lgot_result: - -	retl -	 mov %o2, %o0 - -	.globl	.udiv_patch -.udiv_patch: -	wr	%g0, 0x0, %y -	nop -	nop -	retl -	 udiv	%o0, %o1, %o0 -	nop diff --git a/arch/sparc/lib/udivdi3.S b/arch/sparc/lib/udivdi3.S index b430f1f0ef6..24e0a355e2e 100644 --- a/arch/sparc/lib/udivdi3.S +++ b/arch/sparc/lib/udivdi3.S @@ -60,8 +60,9 @@ __udivdi3:  	bne .LL77  	mov %i0,%o2  	mov 1,%o0 -	call .udiv,0  	mov 0,%o1 +	wr %g0, 0, %y +	udiv %o0, %o1, %o0  	mov %o0,%o3  	mov %i0,%o2  .LL77: diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S deleted file mode 100644 index 1f36ae68252..00000000000 --- a/arch/sparc/lib/umul.S +++ /dev/null @@ -1,171 +0,0 @@ -/* - * umul.S:      This routine was taken from glibc-1.09 and is covered - *              by the GNU Library General Public License Version 2. - */ - - -/* - * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the - * upper 32 bits of the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies.  Short - * multiplies require 25 instruction cycles, and long ones require - * 45 instruction cycles. - * - * On return, overflow has occurred (%o1 is not zero) if and only if - * the Z condition code is clear, allowing, e.g., the following: - * - *	call	.umul - *	nop - *	bnz	overflow	(or tnz) - */ - -	.globl .umul -	.globl _Umul -.umul: -_Umul:	/* needed for export */ -	or	%o0, %o1, %o4 -	mov	%o0, %y		! multiplier -> Y - -	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args -	be	Lmul_shortway	! if zero, can do it the short way -	 andcc	%g0, %g0, %o4	! zero the partial product and clear N and V - -	/* -	 * Long multiply.  32 steps, followed by a final shift step. -	 */ -	mulscc	%o4, %o1, %o4	! 1 -	mulscc	%o4, %o1, %o4	! 2 -	mulscc	%o4, %o1, %o4	! 3 -	mulscc	%o4, %o1, %o4	! 4 -	mulscc	%o4, %o1, %o4	! 5 -	mulscc	%o4, %o1, %o4	! 6 -	mulscc	%o4, %o1, %o4	! 7 -	mulscc	%o4, %o1, %o4	! 8 -	mulscc	%o4, %o1, %o4	! 9 -	mulscc	%o4, %o1, %o4	! 10 -	mulscc	%o4, %o1, %o4	! 11 -	mulscc	%o4, %o1, %o4	! 12 -	mulscc	%o4, %o1, %o4	! 13 -	mulscc	%o4, %o1, %o4	! 14 -	mulscc	%o4, %o1, %o4	! 15 -	mulscc	%o4, %o1, %o4	! 16 -	mulscc	%o4, %o1, %o4	! 17 -	mulscc	%o4, %o1, %o4	! 18 -	mulscc	%o4, %o1, %o4	! 19 -	mulscc	%o4, %o1, %o4	! 20 -	mulscc	%o4, %o1, %o4	! 21 -	mulscc	%o4, %o1, %o4	! 22 -	mulscc	%o4, %o1, %o4	! 23 -	mulscc	%o4, %o1, %o4	! 24 -	mulscc	%o4, %o1, %o4	! 25 -	mulscc	%o4, %o1, %o4	! 26 -	mulscc	%o4, %o1, %o4	! 27 -	mulscc	%o4, %o1, %o4	! 28 -	mulscc	%o4, %o1, %o4	! 29 -	mulscc	%o4, %o1, %o4	! 30 -	mulscc	%o4, %o1, %o4	! 31 -	mulscc	%o4, %o1, %o4	! 32 -	mulscc	%o4, %g0, %o4	! final shift - - -	/* -	 * Normally, with the shift-and-add approach, if both numbers are -	 * positive you get the correct result.  With 32-bit two's-complement -	 * numbers, -x is represented as -	 * -	 *		  x		    32 -	 *	( 2  -  ------ ) mod 2  *  2 -	 *		   32 -	 *		  2 -	 * -	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s, -	 * we can treat this as if the radix point were just to the left -	 * of the sign bit (multiply by 2^32), and get -	 * -	 *	-x  =  (2 - x) mod 2 -	 * -	 * Then, ignoring the `mod 2's for convenience: -	 * -	 *   x *  y	= xy -	 *  -x *  y	= 2y - xy -	 *   x * -y	= 2x - xy -	 *  -x * -y	= 4 - 2x - 2y + xy -	 * -	 * For signed multiplies, we subtract (x << 32) from the partial -	 * product to fix this problem for negative multipliers (see mul.s). 
-	 * Because of the way the shift into the partial product is calculated -	 * (N xor V), this term is automatically removed for the multiplicand, -	 * so we don't have to adjust. -	 * -	 * But for unsigned multiplies, the high order bit wasn't a sign bit, -	 * and the correction is wrong.  So for unsigned multiplies where the -	 * high order bit is one, we end up with xy - (y << 32).  To fix it -	 * we add y << 32. -	 */ -#if 0 -	tst	%o1 -	bl,a	1f		! if %o1 < 0 (high order bit = 1), -	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half) - -1: -	rd	%y, %o0		! get lower half of product -	retl -	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0 -#else -	/* Faster code from tege@sics.se.  */ -	sra	%o1, 31, %o2	! make mask from sign bit -	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1 -	rd	%y, %o0		! get lower half of product -	retl -	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place -#endif - -Lmul_shortway: -	/* -	 * Short multiply.  12 steps, followed by a final shift step. -	 * The resulting bits are off by 12 and (32-12) = 20 bit positions, -	 * but there is no problem with %o0 being negative (unlike above), -	 * and overflow is impossible (the answer is at most 24 bits long). -	 */ -	mulscc	%o4, %o1, %o4	! 1 -	mulscc	%o4, %o1, %o4	! 2 -	mulscc	%o4, %o1, %o4	! 3 -	mulscc	%o4, %o1, %o4	! 4 -	mulscc	%o4, %o1, %o4	! 5 -	mulscc	%o4, %o1, %o4	! 6 -	mulscc	%o4, %o1, %o4	! 7 -	mulscc	%o4, %o1, %o4	! 8 -	mulscc	%o4, %o1, %o4	! 9 -	mulscc	%o4, %o1, %o4	! 10 -	mulscc	%o4, %o1, %o4	! 11 -	mulscc	%o4, %o1, %o4	! 12 -	mulscc	%o4, %g0, %o4	! final shift - -	/* -	 * %o4 has 20 of the bits that should be in the result; %y has -	 * the bottom 12 (as %y's top 12).  That is: -	 * -	 *	  %o4		    %y -	 * +----------------+----------------+ -	 * | -12- |   -20-  | -12- |   -20-  | -	 * +------(---------+------)---------+ -	 *	   -----result----- -	 * -	 * The 12 bits of %o4 left of the `result' area are all zero; -	 * in fact, all top 20 bits of %o4 are zero. -	 */ - -	rd	%y, %o5 -	sll	%o4, 12, %o0	! shift middle bits left 12 -	srl	%o5, 20, %o5	! shift low bits right 20 -	or	%o5, %o0, %o0 -	retl -	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z - -	.globl	.umul_patch -.umul_patch: -	umul	%o0, %o1, %o0 -	retl -	 rd	%y, %o1 -	nop diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S deleted file mode 100644 index 77123eb83c4..00000000000 --- a/arch/sparc/lib/urem.S +++ /dev/null @@ -1,357 +0,0 @@ -/* - * urem.S:      This routine was taken from glibc-1.09 and is covered - *              by the GNU Library General Public License Version 2. - */ - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .urem	name of function to generate - *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - *  false		false=true => signed; false=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  
Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - -	.globl .urem -	.globl _Urem -.urem: -_Urem:	/* needed for export */ - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	 mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	ST_DIV0 -		retl -		 clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	Lgot_result		! (and algorithm fails otherwise) -	 clr	%o2 - -	sethi	%hi(1 << (32 - 4 - 1)), %g1 - -	cmp	%o3, %g1 -	blu	Lnot_really_big -	 clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		 mov	1, %g7 - -		sll	%o5, 4, %o5 - -		b	1b -		 add	%o4, 1, %o4 - -	! Now compute %g7. -	2: -		addcc	%o5, %o5, %o5 -		bcc	Lnot_too_big -		 add	%g7, 1, %g7 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 - -		b	Ldo_single_div -		 sub	%g7, 1, %g7 - -	Lnot_too_big: -	3: -		cmp	%o5, %o3 -		blu	2b -		 nop - -		be	Ldo_single_div -		 nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g7 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	Ldo_single_div: -		subcc	%g7, 1, %g7 -		bl	Lend_regular_divide -		 nop - -		sub	%o3, %o5, %o3 -		mov	1, %o2 - -		b	Lend_single_divloop -		 nop -	Lsingle_divloop: -		sll	%o2, 1, %o2 -		bl	1f -		 srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 -		b	2f -		 add	%o2, 1, %o2 -	1:	! %o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	Lend_single_divloop: -		subcc	%g7, 1, %g7 -		bge	Lsingle_divloop -		 tst	%o3 - -		b,a	Lend_regular_divide - -Lnot_really_big: -1: -	sll	%o5, 4, %o5 - -	cmp	%o5, %o3 -	bleu	1b -	 addcc	%o4, 1, %o4 - -	be	Lgot_result -	 sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	L.1.16 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	L.2.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	L.3.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	L.4.23 -	 srl	%o5,1,%o5 -	! 
remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2+1), %o2 - -L.4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (7*2-1), %o2 - -L.3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	L.4.21 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2+1), %o2 - -L.4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (5*2-1), %o2 - -L.2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	L.3.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	L.4.19 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2+1), %o2 - -L.4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (3*2-1), %o2 - -L.3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	L.4.17 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2+1), %o2 -	 -L.4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (1*2-1), %o2 - -L.1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 2, accumulated bits -1 -	bl	L.2.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	L.3.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	L.4.15 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2+1), %o2 - -L.4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-1*2-1), %o2 - -L.3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	L.4.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2+1), %o2 - -L.4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-3*2-1), %o2 - -L.2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	L.3.13 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	L.4.11 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2+1), %o2 -	 -L.4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-5*2-1), %o2 - -L.3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	L.4.9 -	 srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2+1), %o2 - -L.4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -	b	9f -	 add	%o2, (-7*2-1), %o2 - -	9: -Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	Ldivloop -	 tst	%o3 - -	bl,a	Lgot_result -	! non-restoring fixup here (one instruction only!) -	add	%o3, %o1, %o3 - -Lgot_result: - -	retl -	 mov %o3, %o0 - -	.globl	.urem_patch -.urem_patch: -	wr	%g0, 0x0, %y -	nop -	nop -	nop -	udiv	%o0, %o1, %o2 -	umul	%o2, %o1, %o2 -	retl -	 sub	%o0, %o2, %o0 diff --git a/arch/sparc/lib/usercopy.c b/arch/sparc/lib/usercopy.c deleted file mode 100644 index 14b363fec8a..00000000000 --- a/arch/sparc/lib/usercopy.c +++ /dev/null @@ -1,8 +0,0 @@ -#include <linux/module.h> -#include <linux/bug.h> - -void copy_from_user_overflow(void) -{ -	WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); diff --git a/arch/sparc/lib/xor.S b/arch/sparc/lib/xor.S index f44f58f4023..2c05641c326 100644 --- a/arch/sparc/lib/xor.S +++ b/arch/sparc/lib/xor.S @@ -8,6 +8,7 @@   * Copyright (C) 2006 David S. 
Miller <davem@davemloft.net>   */ +#include <linux/linkage.h>  #include <asm/visasm.h>  #include <asm/asi.h>  #include <asm/dcu.h> @@ -19,12 +20,9 @@   *	!(len & 127) && len >= 256   */  	.text -	.align	32  	/* VIS versions. */ -	.globl	xor_vis_2 -	.type	xor_vis_2,#function -xor_vis_2: +ENTRY(xor_vis_2)  	rd	%fprs, %o5  	andcc	%o5, FPRS_FEF|FPRS_DU, %g0  	be,pt	%icc, 0f @@ -91,11 +89,9 @@ xor_vis_2:  	wr	%g1, %g0, %asi  	retl  	  wr	%g0, 0, %fprs -	.size	xor_vis_2, .-xor_vis_2 +ENDPROC(xor_vis_2) -	.globl	xor_vis_3 -	.type	xor_vis_3,#function -xor_vis_3: +ENTRY(xor_vis_3)  	rd	%fprs, %o5  	andcc	%o5, FPRS_FEF|FPRS_DU, %g0  	be,pt	%icc, 0f @@ -159,11 +155,9 @@ xor_vis_3:  	wr	%g1, %g0, %asi  	retl  	 wr	%g0, 0, %fprs -	.size	xor_vis_3, .-xor_vis_3 +ENDPROC(xor_vis_3) -	.globl	xor_vis_4 -	.type	xor_vis_4,#function -xor_vis_4: +ENTRY(xor_vis_4)  	rd	%fprs, %o5  	andcc	%o5, FPRS_FEF|FPRS_DU, %g0  	be,pt	%icc, 0f @@ -246,11 +240,9 @@ xor_vis_4:  	wr	%g1, %g0, %asi  	retl  	 wr	%g0, 0, %fprs -	.size	xor_vis_4, .-xor_vis_4 +ENDPROC(xor_vis_4) -	.globl	xor_vis_5 -	.type	xor_vis_5,#function -xor_vis_5: +ENTRY(xor_vis_5)  	save	%sp, -192, %sp  	rd	%fprs, %o5  	andcc	%o5, FPRS_FEF|FPRS_DU, %g0 @@ -354,12 +346,10 @@ xor_vis_5:  	wr	%g0, 0, %fprs  	ret  	 restore -	.size	xor_vis_5, .-xor_vis_5 +ENDPROC(xor_vis_5)  	/* Niagara versions. */ -	.globl		xor_niagara_2 -	.type		xor_niagara_2,#function -xor_niagara_2:		/* %o0=bytes, %o1=dest, %o2=src */ +ENTRY(xor_niagara_2) /* %o0=bytes, %o1=dest, %o2=src */  	save		%sp, -192, %sp  	prefetch	[%i1], #n_writes  	prefetch	[%i2], #one_read @@ -402,11 +392,9 @@ xor_niagara_2:		/* %o0=bytes, %o1=dest, %o2=src */  	wr		%g7, 0x0, %asi  	ret  	 restore -	.size		xor_niagara_2, .-xor_niagara_2 +ENDPROC(xor_niagara_2) -	.globl		xor_niagara_3 -	.type		xor_niagara_3,#function -xor_niagara_3:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */ +ENTRY(xor_niagara_3) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */  	save		%sp, -192, %sp  	prefetch	[%i1], #n_writes  	prefetch	[%i2], #one_read @@ -465,11 +453,9 @@ xor_niagara_3:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */  	wr		%g7, 0x0, %asi  	ret  	 restore -	.size		xor_niagara_3, .-xor_niagara_3 +ENDPROC(xor_niagara_3) -	.globl		xor_niagara_4 -	.type		xor_niagara_4,#function -xor_niagara_4:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */ +ENTRY(xor_niagara_4) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */  	save		%sp, -192, %sp  	prefetch	[%i1], #n_writes  	prefetch	[%i2], #one_read @@ -549,11 +535,9 @@ xor_niagara_4:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */  	wr		%g7, 0x0, %asi  	ret  	 restore -	.size		xor_niagara_4, .-xor_niagara_4 +ENDPROC(xor_niagara_4) -	.globl		xor_niagara_5 -	.type		xor_niagara_5,#function -xor_niagara_5:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */ +ENTRY(xor_niagara_5) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */  	save		%sp, -192, %sp  	prefetch	[%i1], #n_writes  	prefetch	[%i2], #one_read @@ -649,4 +633,4 @@ xor_niagara_5:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 *  	wr		%g7, 0x0, %asi  	ret  	 restore -	.size		xor_niagara_5, .-xor_niagara_5 +ENDPROC(xor_niagara_5)  | 
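
Note: the new ucmpdi2.c above supplies the libgcc helper gcc emits for unsigned 64-bit comparisons on 32-bit sparc. The return convention is 0 for less-than, 1 for equal and 2 for greater-than, comparing the high words first and then the low words. A minimal standalone sketch of that convention, written against plain unsigned long long rather than the DWunion high/low pair (names below are illustrative, not from the kernel tree):

#include <assert.h>

/* Same 0/1/2 return convention as __ucmpdi2 above; illustrative only. */
static int ucmpdi2_model(unsigned long long a, unsigned long long b)
{
	if (a < b)
		return 0;	/* less than */
	if (a > b)
		return 2;	/* greater than */
	return 1;		/* equal */
}

int main(void)
{
	assert(ucmpdi2_model(1ULL, 2ULL) == 0);
	assert(ucmpdi2_model(5ULL, 5ULL) == 1);
	assert(ucmpdi2_model(1ULL << 40, 1ULL) == 2);
	return 0;
}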
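
Note: the deleted udiv.S and urem.S (both generated from divrem.m4) implement non-restoring division: the quotient is developed from +1/-1 digits, so the partial remainder may go negative, and the "non-restoring fixup here (one instruction only!)" at the end subtracts 1 from the quotient (udiv) or adds the divisor back to the remainder (urem) when the final remainder came out negative. A minimal C model of the same scheme, simplified to one quotient bit per step instead of the 4-bit steps and comparand scaling the assembly performs; illustrative only, not the kernel's code:

#include <assert.h>
#include <stdint.h>

static void nr_udivrem(uint32_t x, uint32_t y, uint32_t *q, uint32_t *r)
{
	int64_t rem = x;	/* partial remainder, may go negative */
	int64_t quo = 0;	/* quotient built from +1/-1 digits */
	int i;

	assert(y != 0);		/* the real code traps via ta ST_DIV0 */
	for (i = 31; i >= 0; i--) {
		int64_t v = (int64_t)y << i;	/* comparand for this step */

		if (rem >= 0) {
			rem -= v;
			quo = 2 * quo + 1;	/* digit +1 */
		} else {
			rem += v;
			quo = 2 * quo - 1;	/* digit -1 */
		}
	}
	if (rem < 0) {		/* the one-instruction non-restoring fixup */
		quo -= 1;	/* udiv.S:  sub %o2, 1, %o2  */
		rem += y;	/* urem.S:  add %o3, %o1, %o3 */
	}
	*q = (uint32_t)quo;	/* == x / y */
	*r = (uint32_t)rem;	/* == x % y */
}

After the fixup, x == quo * y + rem with 0 <= rem < y, which is what the removed routines returned in %o0 (quotient for .udiv, remainder for .urem).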
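
Note: the deleted umul.S forms the 64-bit product with 32 mulscc steps, which inherently treat one operand as signed; the sra/and/addcc epilogue (the "faster code from tege@sics.se") compensates by adding the other operand into the upper half whenever the signed-treated operand has its top bit set, since an unsigned value with the top bit set is exactly 2^32 larger than its signed interpretation. The identity, as a hedged C sketch (operand naming here is illustrative; in the assembly the multiplier sits in %y):

#include <assert.h>
#include <stdint.h>

static uint64_t umul32_model(uint32_t x, uint32_t y)
{
	/* What a signed shift-and-add loop yields: x times y taken as signed. */
	uint64_t p = (uint64_t)((int64_t)x * (int32_t)y);

	/* (uint32_t)y == (int32_t)y + 2^32 when the top bit is set,
	 * so add x << 32 back in (mod 2^64). */
	if ((int32_t)y < 0)
		p += (uint64_t)x << 32;

	return p;	/* equals (uint64_t)x * (uint64_t)y */
}

int main(void)
{
	assert(umul32_model(2, 0x80000000u) == 0x100000000ull);
	assert(umul32_model(0xffffffffu, 0xffffffffu) == 0xfffffffe00000001ull);
	return 0;
}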
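
Note: in udivdi3.S the call into the removed software .udiv is replaced by the V8 udiv instruction itself, matching the sequence the old .udiv_patch site already used: wr %g0, 0, %y clears the Y register first, because udiv divides the 64-bit value formed by {Y, rs1} by rs2. Roughly, in C (a sketch, not the kernel's code):

#include <stdint.h>

/* Rough model of "wr %g0, 0, %y; udiv %o0, %o1, %o0": with Y cleared this
 * is a plain 32-by-32 unsigned divide (divide-by-zero traps on real
 * hardware rather than being checked here). */
static uint32_t v8_udiv_model(uint32_t o0, uint32_t o1)
{
	uint64_t dividend = ((uint64_t)0 << 32) | o0;	/* {Y = 0, %o0} */

	return (uint32_t)(dividend / o1);
}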
