12 files changed, 488 insertions, 826 deletions
diff --git a/arch/cris/arch-v32/lib/Makefile b/arch/cris/arch-v32/lib/Makefile
index 05b3ec6978d..dd296b9db03 100644
--- a/arch/cris/arch-v32/lib/Makefile
+++ b/arch/cris/arch-v32/lib/Makefile
@@ -2,5 +2,6 @@
 # Makefile for Etrax-specific library files..
 #
 
-lib-y  = checksum.o checksumcopy.o string.o usercopy.o memset.o csumcpfruser.o spinlock.o
+lib-y  = checksum.o checksumcopy.o string.o usercopy.o memset.o \
+	csumcpfruser.o spinlock.o delay.o strcmp.o
 
diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S
index 32e66181b82..4a72a94a49a 100644
--- a/arch/cris/arch-v32/lib/checksum.S
+++ b/arch/cris/arch-v32/lib/checksum.S
@@ -1,41 +1,35 @@
 /*
  * A fast checksum routine using movem
- * Copyright (c) 1998-2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
  *
  * csum_partial(const unsigned char * buff, int len, unsigned int sum)
  */
 
 	.globl	csum_partial
+	.type   csum_partial,@function
 csum_partial:
 
 	;; r10 - src
 	;; r11 - length
 	;; r12 - checksum
 
-	;; check for breakeven length between movem and normal word looping versions
-	;; we also do _NOT_ want to compute a checksum over more than the
-	;; actual length when length < 40
-
-	cmpu.w	80,$r11
-	blo	_word_loop
-	nop
-
-	;; need to save the registers we use below in the movem loop
-	;; this overhead is why we have a check above for breakeven length
-	;; only r0 - r8 have to be saved, the other ones are clobber-able
-	;; according to the ABI
+	;; Optimized for large packets
+	subq	10*4, $r11
+	blt	_word_loop
+	move.d	$r11, $acr
 
 	subq	9*4,$sp
-	subq	10*4,$r11	; update length for the first loop
+	clearf	c
 	movem	$r8,[$sp]
 
 	;; do a movem checksum
 
 _mloop:	movem	[$r10+],$r9	; read 10 longwords
-
+	;; Loop count without touching the c flag.
+	addoq	-10*4, $acr, $acr
 	;; perform dword checksumming on the 10 longwords
 
-	add.d	$r0,$r12
+	addc	$r0,$r12
 	addc	$r1,$r12
 	addc	$r2,$r12
 	addc	$r3,$r12
@@ -46,60 +40,41 @@ _mloop:	movem	[$r10+],$r9	; read 10 longwords
 	addc	$r8,$r12
 	addc	$r9,$r12
 
-	;; fold the carry into the checksum, to avoid having to loop the carry
-	;; back into the top
-
-	addc	0,$r12
-	addc	0,$r12		; do it again, since we might have generated a carry
-
-	subq	10*4,$r11
-	bge	_mloop
-	nop
-
-	addq	10*4,$r11	; compensate for last loop underflowing length
+	;; test $acr without trashing carry.
+	move.d	$acr, $acr
+	bpl	_mloop
+	;; r11 <= acr  is not really needed in the mloop, just using the dslot
+	;; to prepare for what is needed after mloop.
+	move.d	$acr, $r11
 
+	;; fold the last carry into r13
+	addc	0, $r12
 	movem	[$sp+],$r8	; restore regs
 
 _word_loop:
-	;; only fold if there is anything to fold.
-
-	cmpq	0,$r12
-	beq	_no_fold
-
-	;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below.
-	;; r9 and r13 can be used as temporaries.
+	addq	10*4,$r11	; compensate for last loop underflowing length
 
 	moveq	-1,$r9		; put 0xffff in r9, faster than move.d 0xffff,r9
 	lsrq	16,$r9
 
 	move.d	$r12,$r13
 	lsrq	16,$r13		; r13 = checksum >> 16
-	and.d	$r9,$r12		; checksum = checksum & 0xffff
-	add.d	$r13,$r12		; checksum += r13
-	move.d	$r12,$r13		; do the same again, maybe we got a carry last add
-	lsrq	16,$r13
-	and.d	$r9,$r12
-	add.d	$r13,$r12
+	and.d	$r9,$r12	; checksum = checksum & 0xffff
 
 _no_fold:
-	cmpq	2,$r11
+	subq	2,$r11
 	blt	_no_words
-	nop
+	add.d	$r13,$r12	; checksum += r13
 
 	;; checksum the rest of the words
-
-	subq	2,$r11
-
 _wloop:	subq	2,$r11
 	bge	_wloop
 	addu.w	[$r10+],$r12
 
-	addq	2,$r11
-
 _no_words:
+	addq	2,$r11
 	;; see if we have one odd byte more
-	cmpq	1,$r11
-	beq	_do_byte
+	bne	_do_byte
 	nop
 	ret
 	move.d	$r12,$r10
@@ -109,3 +84,5 @@ _do_byte:
 	addu.b	[$r10],$r12
 	ret
 	move.d	$r12,$r10
+
+	.size   csum_partial, .-csum_partial
diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S
index 9303ccbadc6..54e209f18b0 100644
--- a/arch/cris/arch-v32/lib/checksumcopy.S
+++ b/arch/cris/arch-v32/lib/checksumcopy.S
@@ -1,6 +1,6 @@
 /*
  * A fast checksum+copy routine using movem
- * Copyright (c) 1998, 2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
  *
  * Authors:	Bjorn Wesen
  *
@@ -9,6 +9,7 @@
  */
 
 	.globl	csum_partial_copy_nocheck
+	.type   csum_partial_copy_nocheck,@function
 csum_partial_copy_nocheck:
 
 	;; r10 - src
@@ -16,32 +17,23 @@ csum_partial_copy_nocheck:
 	;; r12 - length
 	;; r13 - checksum
 
-	;; check for breakeven length between movem and normal word looping versions
-	;; we also do _NOT_ want to compute a checksum over more than the
-	;; actual length when length < 40
-
-	cmpu.w	80,$r12
-	blo	_word_loop
-	nop
-
-	;; need to save the registers we use below in the movem loop
-	;; this overhead is why we have a check above for breakeven length
-	;; only r0 - r8 have to be saved, the other ones are clobber-able
-	;; according to the ABI
+	;; Optimized for large packets
+	subq	10*4, $r12
+	blt	_word_loop
+	move.d	$r12, $acr
 
 	subq	9*4,$sp
-	subq	10*4,$r12	; update length for the first loop
+	clearf	c
 	movem	$r8,[$sp]
 
 	;; do a movem copy and checksum
-
 1:	;; A failing userspace access (the read) will have this as PC.
 _mloop:	movem	[$r10+],$r9	; read 10 longwords
+	addoq	-10*4, $acr, $acr ; loop counter in latency cycle
 	movem	$r9,[$r11+]	; write 10 longwords
 
 	;; perform dword checksumming on the 10 longwords
-
-	add.d	$r0,$r13
+	addc	$r0,$r13
 	addc	$r1,$r13
 	addc	$r2,$r13
 	addc	$r3,$r13
@@ -52,47 +44,30 @@ _mloop:	movem	[$r10+],$r9	; read 10 longwords
 	addc	$r8,$r13
 	addc	$r9,$r13
 
-	;; fold the carry into the checksum, to avoid having to loop the carry
-	;; back into the top
-
-	addc	0,$r13
-	addc	0,$r13		; do it again, since we might have generated a carry
-
-	subq	10*4,$r12
-	bge	_mloop
-	nop
-
-	addq	10*4,$r12	; compensate for last loop underflowing length
+	;; test $acr, without trashing carry.
+	move.d	$acr, $acr
+	bpl	_mloop
+	;; r12 <= acr  is needed after mloop and in the exception handlers.
+	move.d	$acr, $r12
 
+	;; fold the last carry into r13
+	addc	0, $r13
 	movem	[$sp+],$r8	; restore regs
 
 _word_loop:
-	;; only fold if there is anything to fold.
-
-	cmpq	0,$r13
-	beq	_no_fold
+	addq	10*4,$r12	; compensate for last loop underflowing length
 
 	;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below
 	;; r9 can be used as temporary.
-
 	move.d	$r13,$r9
 	lsrq	16,$r9		; r0 = checksum >> 16
 	and.d	0xffff,$r13	; checksum = checksum & 0xffff
-	add.d	$r9,$r13	; checksum += r0
-	move.d	$r13,$r9	; do the same again, maybe we got a carry last add
-	lsrq	16,$r9
-	and.d	0xffff,$r13
-	add.d	$r9,$r13
 
-_no_fold:
-	cmpq	2,$r12
+	subq	2, $r12
 	blt	_no_words
-	nop
+	add.d	$r9,$r13	; checksum += r0
 
 	;; copy and checksum the rest of the words
-
-	subq	2,$r12
-
 2:	;; A failing userspace access for the read below will have this as PC.
 _wloop:	move.w	[$r10+],$r9
 	addu.w	$r9,$r13
@@ -100,12 +75,9 @@ _wloop:	move.w	[$r10+],$r9
 	bge	_wloop
 	move.w	$r9,[$r11+]
 
-	addq	2,$r12
-
 _no_words:
-	;; see if we have one odd byte more
-	cmpq	1,$r12
-	beq	_do_byte
+	addq	2,$r12
+	bne	_do_byte
 	nop
 	ret
 	move.d	$r13,$r10
@@ -118,3 +90,5 @@ _do_byte:
 	move.b	$r9,[$r11]
 	ret
 	move.d	$r13,$r10
+
+	.size   csum_partial_copy_nocheck, . - csum_partial_copy_nocheck
diff --git a/arch/cris/arch-v32/lib/delay.c b/arch/cris/arch-v32/lib/delay.c
new file mode 100644
index 00000000000..39f1ac9995b
--- /dev/null
+++ b/arch/cris/arch-v32/lib/delay.c
@@ -0,0 +1,28 @@
+/*
+ * Precise Delay Loops for ETRAX FS
+ *
+ * Copyright (C) 2006 Axis Communications AB.
+ *
+ */
+
+#include <hwregs/reg_map.h>
+#include <hwregs/reg_rdwr.h>
+#include <hwregs/timer_defs.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+
+/*
+ * On ETRAX FS, we can check the free-running read-only 100MHz timer
+ * getting 32-bit 10ns precision, theoretically good for 42.94967295
+ * seconds.  Unsigned arithmetic and careful expression handles
+ * wrapping.
+ */
+
+void cris_delay10ns(u32 n10ns)
+{
+	u32 t0 = REG_RD(timer, regi_timer0, r_time);
+	while (REG_RD(timer, regi_timer0, r_time) - t0 < n10ns)
+		;
+}
+EXPORT_SYMBOL(cris_delay10ns);
diff --git a/arch/cris/arch-v32/lib/dram_init.S b/arch/cris/arch-v32/lib/dram_init.S
deleted file mode 100644
index 158b3dbb4d9..00000000000
--- a/arch/cris/arch-v32/lib/dram_init.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* $Id: dram_init.S,v 1.4 2005/04/24 18:48:32 starvik Exp $
- *
- * DRAM/SDRAM initialization - alter with care
- * This file is intended to be included from other assembler files
- *
- * Note: This file may not modify r8 or r9 because they are used to
- * carry information from the decompresser to the kernel
- *
- * Copyright (C) 2000-2003 Axis Communications AB
- *
- * Authors:  Mikael Starvik (starvik@axis.com)
- */
-
-/* Just to be certain the config file is included, we include it here
- * explicitely instead of depending on it being included in the file that
- * uses this code.
- */
-
-#include <asm/arch/hwregs/asm/reg_map_asm.h>
-#include <asm/arch/hwregs/asm/bif_core_defs_asm.h>
-
-	;; WARNING! The registers r8 and r9 are used as parameters carrying
-	;; information from the decompressor (if the kernel was compressed).
-	;; They should not be used in the code below.
-
-	; Refer to BIF MDS for a description of SDRAM initialization
-
-	; Bank configuration
-	move.d   REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0), $r0
-	move.d   CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1
-	move.d   $r1, [$r0]
-	move.d   REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1), $r0
-	move.d   CONFIG_ETRAX_SDRAM_GRP1_CONFIG, $r1
-	move.d   $r1, [$r0]
-
-	; Calculate value of mrs_data
-	; CAS latency = 2 && bus_width = 32 => 0x40
-	; CAS latency = 3 && bus_width = 32 => 0x60
-	; CAS latency = 2 && bus_width = 16 => 0x20
-	; CAS latency = 3 && bus_width = 16 => 0x30
-
-	; Check if value is already supplied in kernel config
-	move.d   CONFIG_ETRAX_SDRAM_COMMAND, $r2
-	bne	 _set_timing
-	nop
-
-	move.d   0x40, $r4       ; Assume 32 bits and CAS latency = 2
-	move.d   CONFIG_ETRAX_SDRAM_TIMING, $r1
- 	and.d    0x07, $r1       ; Get CAS latency
-	cmpq	 2, $r1		 ; CL = 2 ?
-	beq	 _bw_check
-	nop
-	move.d   0x60, $r4
-
-_bw_check:
-	; Assume that group 0 width is equal to group 1. This assumption
-	; is wrong for a group 1 only hardware (such as the grand old
-	; StorPoint+).
-	move.d   CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1
-	and.d    0x200, $r1	; DRAM width is bit 9
-	beq      _set_timing
-	lslq	 2, $r4		;  mrs_data starts at bit 2
-	lsrq     1, $r4		;  16 bits. Shift down value.
-
-	; Set timing parameters (refresh off to avoid Guinness TR 83)
-_set_timing:
-	move.d   CONFIG_ETRAX_SDRAM_TIMING, $r1
-	and.d    ~(3 << reg_bif_core_rw_sdram_timing___ref___lsb), $r1
-	move.d   REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0
-	move.d   $r1, [$r0]
-
-	; Issue NOP command
-	move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd), $r5
-	moveq regk_bif_core_nop, $r1
-	move.d $r1, [$r5]
-
-	; Wait 200us
-	move.d   10000, $r2
-1:	bne      1b
-	subq     1, $r2
-
-	; Issue initialization command sequence
-	move.d   _sdram_commands_start, $r2
-	and.d    0x000fffff, $r2 ; Make sure commands are read from flash
-	move.d   _sdram_commands_end,  $r3
-	and.d    0x000fffff, $r3
-1:	clear.d  $r6
-	move.b   [$r2+], $r6 	; Load command
-	or.d     $r4, $r6	; Add calculated mrs
-	move.d   $r6, [$r5]	; Write rw_sdram_cmd
-	; Wait 80 ns between each command
-	move.d	 4000, $r7
-2:	bne	 2b
-	subq	 1, $r7
-	cmp.d    $r2, $r3	; Last command?
-	bne      1b
-	nop
-
-	; Start refresh
-	move.d   CONFIG_ETRAX_SDRAM_TIMING, $r1
-	move.d   REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0
-	move.d   $r1, [$r0]
-
-	; Initialization finished
-	ba       _sdram_commands_end
-	nop
-
-_sdram_commands_start:
-	.byte   regk_bif_core_pre ; Precharge
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_ref ; refresh
-	.byte   regk_bif_core_mrs ; mrs
-_sdram_commands_end:
diff --git a/arch/cris/arch-v32/lib/hw_settings.S b/arch/cris/arch-v32/lib/hw_settings.S
deleted file mode 100644
index fff9443513d..00000000000
--- a/arch/cris/arch-v32/lib/hw_settings.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * $Id: hw_settings.S,v 1.3 2005/04/24 18:36:57 starvik Exp $
- *
- * This table is used by some tools to extract hardware parameters.
- * The table should be included in the kernel and the decompressor.
- * Don't forget to update the tools if you change this table.
- *
- * Copyright (C) 2001 Axis Communications AB
- *
- * Authors:  Mikael Starvik (starvik@axis.com)
- */
-
-#include <asm/arch/hwregs/asm/reg_map_asm.h>
-#include <asm/arch/hwregs/asm/bif_core_defs_asm.h>
-#include <asm/arch/hwregs/asm/gio_defs_asm.h>
-
-	.ascii "HW_PARAM_MAGIC" ; Magic number
-	.dword 0xc0004000	; Kernel start address
-
-	; Debug port
-#ifdef CONFIG_ETRAX_DEBUG_PORT0
-	.dword 0
-#elif defined(CONFIG_ETRAX_DEBUG_PORT1)
-	.dword 1
-#elif defined(CONFIG_ETRAX_DEBUG_PORT2)
-	.dword 2
-#elif defined(CONFIG_ETRAX_DEBUG_PORT3)
-	.dword 3
-#else
-	.dword 4 ; No debug
-#endif
-
-	; Register values
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_grp1_cfg)
-	.dword CONFIG_ETRAX_MEM_GRP1_CONFIG
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_grp2_cfg)
-	.dword CONFIG_ETRAX_MEM_GRP2_CONFIG
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg)
-	.dword CONFIG_ETRAX_MEM_GRP3_CONFIG
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_grp4_cfg)
-	.dword CONFIG_ETRAX_MEM_GRP4_CONFIG
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0)
-	.dword CONFIG_ETRAX_SDRAM_GRP0_CONFIG
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1)
-	.dword CONFIG_ETRAX_SDRAM_GRP1_CONFIG
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing)
-	.dword CONFIG_ETRAX_SDRAM_TIMING
-	.dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd)
-	.dword CONFIG_ETRAX_SDRAM_COMMAND
-
-	.dword REG_ADDR(gio, regi_gio, rw_pa_dout)
-	.dword CONFIG_ETRAX_DEF_GIO_PA_OUT
-	.dword REG_ADDR(gio, regi_gio, rw_pa_oe)
-	.dword CONFIG_ETRAX_DEF_GIO_PA_OE
-	.dword REG_ADDR(gio, regi_gio, rw_pb_dout)
-	.dword CONFIG_ETRAX_DEF_GIO_PB_OUT
-	.dword REG_ADDR(gio, regi_gio, rw_pb_oe)
-	.dword CONFIG_ETRAX_DEF_GIO_PB_OE
-	.dword REG_ADDR(gio, regi_gio, rw_pc_dout)
-	.dword CONFIG_ETRAX_DEF_GIO_PC_OUT
-	.dword REG_ADDR(gio, regi_gio, rw_pc_oe)
-	.dword CONFIG_ETRAX_DEF_GIO_PC_OE
-	.dword REG_ADDR(gio, regi_gio, rw_pd_dout)
-	.dword CONFIG_ETRAX_DEF_GIO_PD_OUT
-	.dword REG_ADDR(gio, regi_gio, rw_pd_oe)
-	.dword CONFIG_ETRAX_DEF_GIO_PD_OE
-	.dword REG_ADDR(gio, regi_gio, rw_pe_dout)
-	.dword CONFIG_ETRAX_DEF_GIO_PE_OUT
-	.dword REG_ADDR(gio, regi_gio, rw_pe_oe)
-	.dword CONFIG_ETRAX_DEF_GIO_PE_OE
-
-	.dword 0 ; No more register values
diff --git a/arch/cris/arch-v32/lib/memset.c b/arch/cris/arch-v32/lib/memset.c
index ffca1214674..c94ea9b3ec2 100644
--- a/arch/cris/arch-v32/lib/memset.c
+++ b/arch/cris/arch-v32/lib/memset.c
@@ -1,253 +1,259 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# FUNCTION NAME: memset()                                                 */
-/*#                                                                         */
-/*# PARAMETERS:  void* dst;   Destination address.                          */
-/*#              int     c;   Value of byte to write.                       */
-/*#              int   len;   Number of bytes to write.                     */
-/*#                                                                         */
-/*# RETURNS:     dst.                                                       */
-/*#                                                                         */
-/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
-/*#              Framework taken from memcpy.  This routine is              */
-/*#              very sensitive to compiler changes in register allocation. */
-/*#              Should really be rewritten to avoid this problem.          */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# HISTORY                                                                 */
-/*#                                                                         */
-/*# DATE      NAME            CHANGES                                       */
-/*# ----      ----            -------                                       */
-/*# 990713    HP              Tired of watching this function (or           */
-/*#                           really, the nonoptimized generic              */
-/*#                           implementation) take up 90% of simulator      */
-/*#                           output.  Measurements needed.                 */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-/* No, there's no macro saying 12*4, since it is "hard" to get it into
-   the asm in a good way.  Thus better to expose the problem everywhere.
-   */
-
-/* Assuming 1 cycle per dword written or read (ok, not really true), and
-   one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
-   so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
-
-#define ZERO_BLOCK_SIZE (1*12*4)
-
-void *memset(void *pdst,
-             int c,
-             size_t plen)
+/* A memset for CRIS.
+   Copyright (C) 1999-2005 Axis Communications.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Neither the name of Axis Communications nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.  */
+
+/* FIXME: This file should really only be used for reference, as the
+   result is somewhat depending on gcc generating what we expect rather
+   than what we describe.  An assembly file should be used instead.  */
+
+/* Note the multiple occurrence of the expression "12*4", including the
+   asm.  It is hard to get it into the asm in a good way.  Thus better to
+   expose the problem everywhere: no macro.  */
+
+/* Assuming one cycle per dword written or read (ok, not really true; the
+   world is not ideal), and one cycle per instruction, then 43+3*(n/48-1)
+   <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full
+   48-byte block to set.  */
+
+#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48)
+
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+
+void *memset(void *pdst, int c, unsigned int plen)
 {
-  /* Ok.  Now we want the parameters put in special registers.
-     Make sure the compiler is able to make something useful of this. */
+  /* Now we want the parameters in special registers.  Make sure the
+     compiler does something usable with this.  */
 
   register char *return_dst __asm__ ("r10") = pdst;
   register int n __asm__ ("r12") = plen;
   register int lc __asm__ ("r11") = c;
 
-  /* Most apps use memset sanely.  Only those memsetting about 3..4
-     bytes or less get penalized compared to the generic implementation
-     - and that's not really sane use. */
+  /* Most apps use memset sanely.  Memsetting about 3..4 bytes or less get
+     penalized here compared to the generic implementation.  */
 
-  /* Ugh.  This is fragile at best.  Check with newer GCC releases, if
-     they compile cascaded "x |= x << 8" sanely! */
-  __asm__("movu.b %0,$r13	\n\
-           lslq 8,$r13		\n\
-	   move.b %0,$r13	\n\
-	   move.d $r13,%0	\n\
-	   lslq 16,$r13		\n\
-	   or.d $r13,%0"
-          : "=r" (lc) : "0" (lc) : "r13");
+  /* This is fragile performancewise at best.  Check with newer GCC
+     releases, if they compile cascaded "x |= x << 8" to sane code.  */
+  __asm__("movu.b %0,r13						\n\
+	   lslq 8,r13							\n\
+	   move.b %0,r13						\n\
+	   move.d r13,%0						\n\
+	   lslq 16,r13							\n\
+	   or.d r13,%0"
+          : "=r" (lc)		/* Inputs.  */
+	  : "0" (lc)		/* Outputs.  */
+	  : "r13");		/* Trash.  */
 
   {
     register char *dst __asm__ ("r13") = pdst;
 
-  /* This is NONPORTABLE, but since this whole routine is     */
-  /* grossly nonportable that doesn't matter.                 */
+    if (((unsigned long) pdst & 3) != 0
+	/* Oops! n = 0 must be a valid call, regardless of alignment.  */
+	&& n >= 3)
+      {
+	if ((unsigned long) dst & 1)
+	  {
+	    *dst = (char) lc;
+	    n--;
+	    dst++;
+	  }
 
-  if (((unsigned long) pdst & 3) != 0
-     /* Oops! n=0 must be a legal call, regardless of alignment. */
-      && n >= 3)
-  {
-    if ((unsigned long)dst & 1)
-    {
-      *dst = (char) lc;
-      n--;
-      dst++;
-    }
-
-    if ((unsigned long)dst & 2)
-    {
-      *(short *)dst = lc;
-      n -= 2;
-      dst += 2;
-    }
-  }
+	if ((unsigned long) dst & 2)
+	  {
+	    *(short *) dst = lc;
+	    n -= 2;
+	    dst += 2;
+	  }
+      }
 
-  /* Now the fun part.  For the threshold value of this, check the equation
-     above. */
-  /* Decide which copying method to use. */
-  if (n >= ZERO_BLOCK_SIZE)
-  {
-    /* For large copies we use 'movem' */
-
-  /* It is not optimal to tell the compiler about clobbering any
-     registers; that will move the saving/restoring of those registers
-     to the function prologue/epilogue, and make non-movem sizes
-     suboptimal.
-
-      This method is not foolproof; it assumes that the "asm reg"
-     declarations at the beginning of the function really are used
-     here (beware: they may be moved to temporary registers).
-      This way, we do not have to save/move the registers around into
-     temporaries; we can safely use them straight away.
-
-      If you want to check that the allocation was right; then
-      check the equalities in the first comment.  It should say
-      "r13=r13, r12=r12, r11=r11" */
-    __asm__ volatile ("							\n\
-        ;; Check that the register asm declaration got right.		\n\
-        ;; The GCC manual says it will work, but there *has* been bugs.	\n\
-	.ifnc %0-%1-%4,$r13-$r12-$r11					\n\
-	.err								\n\
-	.endif								\n\
+    /* Decide which setting method to use.  */
+    if (n >= MEMSET_BY_BLOCK_THRESHOLD)
+      {
+	/* It is not optimal to tell the compiler about clobbering any
+	   registers; that will move the saving/restoring of those registers
+	   to the function prologue/epilogue, and make non-block sizes
+	   suboptimal.  */
+	__asm__ volatile
+	  ("\
+	   ;; GCC does promise correct register allocations, but let's	\n\
+	   ;; make sure it keeps its promises.				\n\
+	   .ifnc %0-%1-%4,$r13-$r12-$r11				\n\
+	   .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\"	\n\
+	   .endif							\n\
 									\n\
-	;; Save the registers we'll clobber in the movem process	\n\
-	;; on the stack.  Don't mention them to gcc, it will only be	\n\
-	;; upset.							\n\
-	subq 	11*4,$sp						\n\
-        movem   $r10,[$sp]						\n\
+	   ;; Save the registers we'll clobber in the movem process	\n\
+	   ;; on the stack.  Don't mention them to gcc, it will only be	\n\
+	   ;; upset.							\n\
+	   subq	   11*4,sp						\n\
+	   movem   r10,[sp]						\n\
 									\n\
-        move.d  $r11,$r0						\n\
-        move.d  $r11,$r1						\n\
-        move.d  $r11,$r2						\n\
-        move.d  $r11,$r3						\n\
-        move.d  $r11,$r4						\n\
-        move.d  $r11,$r5						\n\
-        move.d  $r11,$r6						\n\
-        move.d  $r11,$r7						\n\
-        move.d  $r11,$r8						\n\
-        move.d  $r11,$r9						\n\
-        move.d  $r11,$r10						\n\
+	   move.d  r11,r0						\n\
+	   move.d  r11,r1						\n\
+	   move.d  r11,r2						\n\
+	   move.d  r11,r3						\n\
+	   move.d  r11,r4						\n\
+	   move.d  r11,r5						\n\
+	   move.d  r11,r6						\n\
+	   move.d  r11,r7						\n\
+	   move.d  r11,r8						\n\
+	   move.d  r11,r9						\n\
+	   move.d  r11,r10						\n\
 									\n\
-        ;; Now we've got this:						\n\
-	;; r13 - dst							\n\
-	;; r12 - n							\n\
+	   ;; Now we've got this:					\n\
+	   ;; r13 - dst							\n\
+	   ;; r12 - n							\n\
 									\n\
-        ;; Update n for the first loop					\n\
-        subq    12*4,$r12						\n\
+	   ;; Update n for the first loop				\n\
+	   subq	   12*4,r12						\n\
 0:									\n\
-        subq   12*4,$r12						\n\
-        bge     0b							\n\
-	movem	$r11,[$r13+]						\n\
+"
+#ifdef __arch_common_v10_v32
+	   /* Cater to branch offset difference between v32 and v10.  We
+	      assume the branch below has an 8-bit offset.  */
+"	   setf\n"
+#endif
+"	   subq	  12*4,r12						\n\
+	   bge	   0b							\n\
+	   movem	r11,[r13+]					\n\
 									\n\
-        addq   12*4,$r12  ;; compensate for last loop underflowing n	\n\
+	   ;; Compensate for last loop underflowing n.			\n\
+	   addq	  12*4,r12						\n\
 									\n\
-	;; Restore registers from stack					\n\
-        movem [$sp+],$r10"
+	   ;; Restore registers from stack.				\n\
+	   movem [sp+],r10"
 
-     /* Outputs */ : "=r" (dst), "=r" (n)
-     /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
-  }
+	   /* Outputs.	*/
+	   : "=r" (dst), "=r" (n)
+
+	   /* Inputs.  */
+	   : "0" (dst), "1" (n), "r" (lc));
+      }
+
+    /* An ad-hoc unroll, used for 4*12-1..16 bytes. */
+    while (n >= 16)
+      {
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	n -= 16;
+      }
 
-    /* Either we directly starts copying, using dword copying
-       in a loop, or we copy as much as possible with 'movem'
-       and then the last block (<44 bytes) is copied here.
-       This will work since 'movem' will have updated src,dst,n. */
-
-    while ( n >= 16 )
-    {
-      *((long*)dst)++ = lc;
-      *((long*)dst)++ = lc;
-      *((long*)dst)++ = lc;
-      *((long*)dst)++ = lc;
-      n -= 16;
-    }
-
-    /* A switch() is definitely the fastest although it takes a LOT of code.
-     * Particularly if you inline code this.
-     */
     switch (n)
-    {
+      {
       case 0:
         break;
+
       case 1:
-        *(char*)dst = (char) lc;
+        *dst = (char) lc;
         break;
+
       case 2:
-        *(short*)dst = (short) lc;
+        *(short *) dst = (short) lc;
         break;
+
       case 3:
-        *((short*)dst)++ = (short) lc;
-        *(char*)dst = (char) lc;
+        *(short *) dst = (short) lc; dst += 2;
+        *dst = (char) lc;
         break;
+
       case 4:
-        *((long*)dst)++ = lc;
+        *(long *) dst = lc;
         break;
+
       case 5:
-        *((long*)dst)++ = lc;
-        *(char*)dst = (char) lc;
+        *(long *) dst = lc; dst += 4;
+        *dst = (char) lc;
         break;
+
       case 6:
-        *((long*)dst)++ = lc;
-        *(short*)dst = (short) lc;
+        *(long *) dst = lc; dst += 4;
+        *(short *) dst = (short) lc;
         break;
+
       case 7:
-        *((long*)dst)++ = lc;
-        *((short*)dst)++ = (short) lc;
-        *(char*)dst = (char) lc;
+        *(long *) dst = lc; dst += 4;
+        *(short *) dst = (short) lc; dst += 2;
+        *dst = (char) lc;
         break;
+
       case 8:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc;
         break;
+
       case 9:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *(char*)dst = (char) lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *dst = (char) lc;
         break;
+
       case 10:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *(short*)dst = (short) lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(short *) dst = (short) lc;
         break;
+
       case 11:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *((short*)dst)++ = (short) lc;
-        *(char*)dst = (char) lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(short *) dst = (short) lc; dst += 2;
+        *dst = (char) lc;
         break;
+
       case 12:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc;
         break;
+
       case 13:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *(char*)dst = (char) lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *dst = (char) lc;
         break;
+
       case 14:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *(short*)dst = (short) lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(short *) dst = (short) lc;
         break;
+
       case 15:
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *((long*)dst)++ = lc;
-        *((short*)dst)++ = (short) lc;
-        *(char*)dst = (char) lc;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(long *) dst = lc; dst += 4;
+        *(short *) dst = (short) lc; dst += 2;
+        *dst = (char) lc;
         break;
-    }
+      }
   }
 
-  return return_dst; /* destination pointer. */
-} /* memset() */
+  return return_dst;
+}
diff --git a/arch/cris/arch-v32/lib/nand_init.S b/arch/cris/arch-v32/lib/nand_init.S
deleted file mode 100644
index e019816facd..00000000000
--- a/arch/cris/arch-v32/lib/nand_init.S
+++ /dev/null
@@ -1,178 +0,0 @@
-##=============================================================================
-##
-##      nand_init.S
-##
-##      The bootrom copies data from the NAND flash to the internal RAM but
-##      due to a bug/feature we can only trust the 256 first bytes. So this
-##      code copies more data from NAND flash to internal RAM. Obvioulsy this
-##      code must fit in the first 256 bytes so alter with care.
-##
-##	Some notes about the bug/feature for future reference:
-##        The bootrom copies the first 127 KB from NAND flash to internal
-##        memory. The problem is that it does a bytewise copy. NAND flashes
-##        does autoincrement on the address so for a 16-bite device each
-##        read/write increases the address by two. So the copy loop in the
-##        bootrom will discard every second byte. This is solved by inserting
-##        zeroes in every second byte in the first erase block.
-##
-##        The bootrom also incorrectly assumes that it can read the flash
-##        linear with only one read command but the flash will actually
-##        switch between normal area and spare area if you do that so we
-##        can't trust more than the first 256 bytes.
-##
-##=============================================================================
-
-#include <asm/arch/hwregs/asm/reg_map_asm.h>
-#include <asm/arch/hwregs/asm/gio_defs_asm.h>
-#include <asm/arch/hwregs/asm/pinmux_defs_asm.h>
-#include <asm/arch/hwregs/asm/bif_core_defs_asm.h>
-#include <asm/arch/hwregs/asm/config_defs_asm.h>
-
-;; There are 8-bit NAND flashes and 16-bit NAND flashes.
-;; We need to treat them slightly different.
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
-#define PAGE_SIZE 256
-#else
-#error 2
-#define PAGE_SIZE 512
-#endif
-#define ERASE_BLOCK 16384
-
-;; GPIO pins connected to NAND flash
-#define CE 4
-#define CLE 5
-#define ALE 6
-#define BY 7
-
-;; Address space for NAND flash
-#define NAND_RD_ADDR 0x90000000
-#define NAND_WR_ADDR 0x94000000
-
-#define READ_CMD 0x00
-
-;; Readability macros
-#define CSP_MASK \
-	REG_MASK(bif_core, rw_grp3_cfg, gated_csp0) | \
-	REG_MASK(bif_core, rw_grp3_cfg, gated_csp1)
-#define CSP_VAL \
-	REG_STATE(bif_core, rw_grp3_cfg, gated_csp0, rd) | \
-	REG_STATE(bif_core, rw_grp3_cfg, gated_csp1, wr)
-
-;;----------------------------------------------------------------------------
-;; Macros to set/clear GPIO bits
-
-.macro SET x
-	or.b   (1<<\x),$r9
-	move.d $r9, [$r2]
-.endm
-
-.macro CLR x
-	and.b  ~(1<<\x),$r9
-	move.d $r9, [$r2]
-.endm
-
-;;----------------------------------------------------------------------------
-
-nand_boot:
-	;; Check if nand boot was selected
-	move.d REG_ADDR(config, regi_config, r_bootsel), $r0
-	move.d [$r0], $r0
-	and.d  REG_MASK(config, r_bootsel, boot_mode), $r0
-	cmp.d  REG_STATE(config, r_bootsel, boot_mode, nand), $r0
-	bne normal_boot ; No NAND boot
-	nop
-
-copy_nand_to_ram:
-	;; copy_nand_to_ram
-	;; Arguments
-	;;   r10 - destination
-	;;   r11 - source offset
-	;;   r12 - size
-	;;   r13 - Address to jump to after completion
-	;; Note : r10-r12 are clobbered on return
-	;; Registers used:
-	;;   r0 - NAND_RD_ADDR
-	;;   r1 - NAND_WR_ADDR
-	;;   r2 - reg_gio_rw_pa_dout
-	;;   r3 - reg_gio_r_pa_din
-	;;   r4 - tmp
-	;;   r5 - byte counter within a page
-	;;   r6 - reg_pinmux_rw_pa
-	;;   r7 - reg_gio_rw_pa_oe
-	;;   r8 - reg_bif_core_rw_grp3_cfg
-	;;   r9 - reg_gio_rw_pa_dout shadow
-	move.d 0x90000000, $r0
-	move.d 0x94000000, $r1
-	move.d REG_ADDR(gio, regi_gio, rw_pa_dout), $r2
-	move.d REG_ADDR(gio, regi_gio, r_pa_din), $r3
-	move.d REG_ADDR(pinmux, regi_pinmux, rw_pa), $r6
-	move.d REG_ADDR(gio, regi_gio, rw_pa_oe), $r7
-	move.d REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg), $r8
-
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
-	lsrq	1, $r11
-#endif
-	;; Set up GPIO
-	move.d [$r2], $r9
-	move.d [$r7], $r4
-	or.b (1<<ALE) | (1 << CLE) | (1<<CE), $r4
-	move.d $r4, [$r7]
-
-	;; Set up bif
-	move.d [$r8], $r4
-	and.d CSP_MASK, $r4
-	or.d CSP_VAL, $r4
-	move.d $r4, [$r8]
-
-1:	;; Copy one page
-	CLR CE
-	SET CLE
-	moveq	READ_CMD, $r4
-	move.b	$r4, [$r1]
-	moveq	20, $r4
-2:	bne	2b
-	subq	1, $r4
-	CLR CLE
-	SET ALE
-	clear.w [$r1] 		; Column address = 0
-	move.d	$r11, $r4
-	lsrq	8, $r4
-	move.b	$r4, [$r1]	; Row address
-	lsrq	8, $r4
-	move.b	$r4, [$r1]	; Row adddress
-	moveq	20, $r4
-2:	bne	2b
-	subq	1, $r4
-	CLR ALE
-2:	move.d	[$r3], $r4
-	and.d	1 << BY, $r4
-	beq 2b
-	movu.w  PAGE_SIZE, $r5
-2:	; Copy one byte/word
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
-	move.w  [$r0], $r4
-#else
-	move.b  [$r0], $r4
-#endif
-	subq	1, $r5
-	bne	2b
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
-	move.w  $r4, [$r10+]
-	subu.w	PAGE_SIZE*2, $r12
-#else
-	move.b  $r4, [$r10+]
-	subu.w	PAGE_SIZE, $r12
-#endif
-	bpl	1b
-	addu.w	PAGE_SIZE, $r11
-
-	;; End of copy
-	jump	$r13
-	nop
-
-	;; This will warn if the code above is too large. If you consider
-	;; to remove this you don't understand the bug/feature.
-	.org 256
-	.org ERASE_BLOCK
-
-normal_boot:
diff --git a/arch/cris/arch-v32/lib/spinlock.S b/arch/cris/arch-v32/lib/spinlock.S
index 2437ae7f6ed..fe610b9d775 100644
--- a/arch/cris/arch-v32/lib/spinlock.S
+++ b/arch/cris/arch-v32/lib/spinlock.S
@@ -6,28 +6,35 @@
 
 
 	.global cris_spin_lock
+	.type   cris_spin_lock,@function
 	.global cris_spin_trylock
+	.type   cris_spin_trylock,@function
 
 	.text
 
 cris_spin_lock:
 	clearf	p
-1:	test.d	[$r10]
+1:	test.b	[$r10]
 	beq	1b
 	clearf	p
 	ax
-	clear.d [$r10]
+	clear.b [$r10]
 	bcs     1b
 	clearf	p
 	ret
 	nop
 
+	.size   cris_spin_lock, . - cris_spin_lock
+
 cris_spin_trylock:
 	clearf	p
-1:	move.d	[$r10], $r11
+1:	move.b	[$r10], $r11
 	ax
-	clear.d [$r10]
+	clear.b [$r10]
         bcs	1b
         clearf	p
 	ret
-	move.d	$r11,$r10
+	movu.b	$r11,$r10
+
+	.size   cris_spin_trylock, . - cris_spin_trylock
+
diff --git a/arch/cris/arch-v32/lib/strcmp.S b/arch/cris/arch-v32/lib/strcmp.S
new file mode 100644
index 00000000000..8f7a1ee6259
--- /dev/null
+++ b/arch/cris/arch-v32/lib/strcmp.S
@@ -0,0 +1,21 @@
+; strcmp.S -- CRISv32 version.
+; Copyright (C) 2008 AXIS Communications AB
+; Written by Edgar E. Iglesias
+;
+; This source code is licensed under the GNU General Public License,
+; Version 2.  See the file COPYING for more details.
+
+	.global	strcmp
+	.type	strcmp,@function
+strcmp:
+1:
+	move.b	[$r10+], $r12
+	seq	$r13
+	sub.b	[$r11+], $r12
+	or.b	$r12, $r13
+	beq	1b
+	nop
+
+	ret
+	movs.b	$r12, $r10
+	.size	strcmp, . - strcmp
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c
index 98e282ac824..c7bd6ebdc93 100644
--- a/arch/cris/arch-v32/lib/string.c
+++ b/arch/cris/arch-v32/lib/string.c
@@ -1,55 +1,59 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# FUNCTION NAME: memcpy()                                                 */
-/*#                                                                         */
-/*# PARAMETERS:  void* dst;   Destination address.                          */
-/*#              void* src;   Source address.                               */
-/*#              int   len;   Number of bytes to copy.                      */
-/*#                                                                         */
-/*# RETURNS:     dst.                                                       */
-/*#                                                                         */
-/*# DESCRIPTION: Copies len bytes of memory from src to dst.  No guarantees */
-/*#              about copying of overlapping memory areas. This routine is */
-/*#              very sensitive to compiler changes in register allocation. */
-/*#              Should really be rewritten to avoid this problem.          */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-/*#                                                                         */
-/*# HISTORY                                                                 */
-/*#                                                                         */
-/*# DATE      NAME            CHANGES                                       */
-/*# ----      ----            -------                                       */
-/*# 941007    Kenny R         Creation                                      */
-/*# 941011    Kenny R         Lots of optimizations and inlining.           */
-/*# 941129    Ulf A           Adapted for use in libc.                      */
-/*# 950216    HP              N==0 forgotten if non-aligned src/dst.        */
-/*#                           Added some optimizations.                     */
-/*# 001025    HP              Make src and dst char *.  Align dst to	    */
-/*#			      dword, not just word-if-both-src-and-dst-	    */
-/*#			      are-misaligned.				    */
-/*#                                                                         */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-void *memcpy(void *pdst,
-             const void *psrc,
-             size_t pn)
+/* A memcpy for CRIS.
+   Copyright (C) 1994-2005 Axis Communications.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Neither the name of Axis Communications nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.  */
+
+/* FIXME: This file should really only be used for reference, as the
+   result is somewhat depending on gcc generating what we expect rather
+   than what we describe.  An assembly file should be used instead.  */
+
+#include <stddef.h>
+
+/* Break even between movem and move16 is really at 38.7 * 2, but
+   modulo 44, so up to the next multiple of 44, we use ordinary code.  */
+#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
+
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+
+void *
+memcpy(void *pdst, const void *psrc, size_t pn)
 {
-  /* Ok.  Now we want the parameters put in special registers.
+  /* Now we want the parameters put in special registers.
      Make sure the compiler is able to make something useful of this.
-      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
+     As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
 
      If gcc was allright, it really would need no temporaries, and no
-     stack space to save stuff on. */
+     stack space to save stuff on.  */
 
   register void *return_dst __asm__ ("r10") = pdst;
-  register char *dst __asm__ ("r13") = pdst;
-  register const char *src __asm__ ("r11") = psrc;
+  register unsigned char *dst __asm__ ("r13") = pdst;
+  register unsigned const char *src __asm__ ("r11") = psrc;
   register int n __asm__ ("r12") = pn;
 
-
   /* When src is aligned but not dst, this makes a few extra needless
      cycles.  I believe it would take as many to check that the
      re-alignment was unnecessary.  */
@@ -59,161 +63,174 @@ void *memcpy(void *pdst,
       && n >= 3)
   {
     if ((unsigned long) dst & 1)
-    {
-      n--;
-      *(char*)dst = *(char*)src;
-      src++;
-      dst++;
-    }
+      {
+	n--;
+	*dst = *src;
+	src++;
+	dst++;
+      }
 
     if ((unsigned long) dst & 2)
-    {
-      n -= 2;
-      *(short*)dst = *(short*)src;
-      src += 2;
-      dst += 2;
-    }
+      {
+	n -= 2;
+	*(short *) dst = *(short *) src;
+	src += 2;
+	dst += 2;
+      }
   }
 
-  /* Decide which copying method to use.  Movem is dirt cheap, so the
-     overheap is low enough to always use the minimum block size as the
-     threshold.  */
-  if (n >= 44)
-  {
-    /* For large copies we use 'movem' */
-
-  /* It is not optimal to tell the compiler about clobbering any
-     registers; that will move the saving/restoring of those registers
-     to the function prologue/epilogue, and make non-movem sizes
-     suboptimal.  */
-    __asm__ volatile ("							\n\
-        ;; Check that the register asm declaration got right.		\n\
-        ;; The GCC manual explicitly says TRT will happen.		\n\
-	.ifnc %0-%1-%2,$r13-$r11-$r12					\n\
-	.err								\n\
-	.endif								\n\
-									\n\
-	;; Save the registers we'll use in the movem process		\n\
+  /* Decide which copying method to use.  */
+  if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
+    {
+      /* It is not optimal to tell the compiler about clobbering any
+	 registers; that will move the saving/restoring of those registers
+	 to the function prologue/epilogue, and make non-movem sizes
+	 suboptimal.  */
+      __asm__ volatile
+	("\
+	 ;; GCC does promise correct register allocations, but let's	\n\
+	 ;; make sure it keeps its promises.				\n\
+	 .ifnc %0-%1-%2,$r13-$r11-$r12					\n\
+	 .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\"	\n\
+	 .endif								\n\
 									\n\
-	;; on the stack.						\n\
-	subq 	11*4,$sp						\n\
-	movem	$r10,[$sp]						\n\
+	 ;; Save the registers we'll use in the movem process		\n\
+	 ;; on the stack.						\n\
+	 subq	11*4,sp							\n\
+	 movem	r10,[sp]						\n\
 									\n\
-        ;; Now we've got this:						\n\
-	;; r11 - src							\n\
-	;; r13 - dst							\n\
-	;; r12 - n							\n\
+	 ;; Now we've got this:						\n\
+	 ;; r11 - src							\n\
+	 ;; r13 - dst							\n\
+	 ;; r12 - n							\n\
 									\n\
-        ;; Update n for the first loop					\n\
-        subq    44,$r12							\n\
+	 ;; Update n for the first loop.				\n\
+	 subq	 44,r12							\n\
 0:									\n\
-	movem	[$r11+],$r10						\n\
-        subq   44,$r12							\n\
-        bge     0b							\n\
-	movem	$r10,[$r13+]						\n\
+"
+#ifdef __arch_common_v10_v32
+	 /* Cater to branch offset difference between v32 and v10.  We
+	    assume the branch below has an 8-bit offset.  */
+"	 setf\n"
+#endif
+"	 movem	[r11+],r10						\n\
+	 subq	44,r12							\n\
+	 bge	 0b							\n\
+	 movem	r10,[r13+]						\n\
 									\n\
-        addq   44,$r12  ;; compensate for last loop underflowing n	\n\
+	 ;; Compensate for last loop underflowing n.			\n\
+	 addq	44,r12							\n\
 									\n\
-	;; Restore registers from stack					\n\
-        movem [$sp+],$r10"
+	 ;; Restore registers from stack.				\n\
+	 movem [sp+],r10"
 
-     /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
-     /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+	 /* Outputs.  */
+	 : "=r" (dst), "=r" (src), "=r" (n)
 
-  }
+	 /* Inputs.  */
+	 : "0" (dst), "1" (src), "2" (n));
+    }
 
-  /* Either we directly starts copying, using dword copying
-     in a loop, or we copy as much as possible with 'movem'
-     and then the last block (<44 bytes) is copied here.
-     This will work since 'movem' will have updated src,dst,n. */
+  while (n >= 16)
+    {
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
 
-  while ( n >= 16 )
-  {
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    *((long*)dst)++ = *((long*)src)++;
-    n -= 16;
-  }
+      n -= 16;
+    }
 
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
   switch (n)
-  {
+    {
     case 0:
       break;
+
     case 1:
-      *(char*)dst = *(char*)src;
+      *dst = *src;
       break;
+
     case 2:
-      *(short*)dst = *(short*)src;
+      *(short *) dst = *(short *) src;
       break;
+
     case 3:
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 4:
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src;
       break;
+
     case 5:
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 6:
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 7:
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 8:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src;
       break;
+
     case 9:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 10:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 11:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
+
     case 12:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src;
       break;
+
     case 13:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *dst = *src;
       break;
+
     case 14:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *(short*)dst = *(short*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src;
       break;
+
     case 15:
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((long*)dst)++ = *((long*)src)++;
-      *((short*)dst)++ = *((short*)src)++;
-      *(char*)dst = *(char*)src;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(long *) dst = *(long *) src; dst += 4; src += 4;
+      *(short *) dst = *(short *) src; dst += 2; src += 2;
+      *dst = *src;
       break;
-  }
+    }
 
-  return return_dst; /* destination pointer. */
-} /* memcpy() */
+  return return_dst;
+}
diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c
index f0b08460c1b..0b5b70d5f58 100644
--- a/arch/cris/arch-v32/lib/usercopy.c
+++ b/arch/cris/arch-v32/lib/usercopy.c
@@ -34,7 +34,7 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn)
      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
 
      FIXME: Comment for old gcc version.  Check.
-     If gcc was allright, it really would need no temporaries, and no
+     If gcc was alright, it really would need no temporaries, and no
      stack space to save stuff on. */
 
   register char *dst __asm__ ("r13") = pdst;
@@ -161,14 +161,14 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn)
    inaccessible.  */
 
 unsigned long
-__copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn)
+__copy_user_zeroing(void *pdst, const void __user *psrc, unsigned long pn)
 {
   /* We want the parameters put in special registers.
      Make sure the compiler is able to make something useful of this.
      As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
 
      FIXME: Comment for old gcc version.  Check.
-     If gcc was allright, it really would need no temporaries, and no
+     If gcc was alright, it really would need no temporaries, and no
      stack space to save stuff on.  */
 
   register char *dst __asm__ ("r13") = pdst;
@@ -332,7 +332,7 @@ __do_clear_user (void __user *pto, unsigned long pn)
       As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
 
      FIXME: Comment for old gcc version.  Check.
-     If gcc was allright, it really would need no temporaries, and no
+     If gcc was alright, it really would need no temporaries, and no
      stack space to save stuff on. */
 
   register char *dst __asm__ ("r13") = pto;