aboutsummaryrefslogtreecommitdiff
path: root/arch/cris/arch-v32/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/cris/arch-v32/lib')
-rw-r--r--arch/cris/arch-v32/lib/Makefile3
-rw-r--r--arch/cris/arch-v32/lib/checksum.S75
-rw-r--r--arch/cris/arch-v32/lib/checksumcopy.S72
-rw-r--r--arch/cris/arch-v32/lib/delay.c28
-rw-r--r--arch/cris/arch-v32/lib/dram_init.S119
-rw-r--r--arch/cris/arch-v32/lib/hw_settings.S72
-rw-r--r--arch/cris/arch-v32/lib/memset.c398
-rw-r--r--arch/cris/arch-v32/lib/nand_init.S178
-rw-r--r--arch/cris/arch-v32/lib/spinlock.S17
-rw-r--r--arch/cris/arch-v32/lib/strcmp.S21
-rw-r--r--arch/cris/arch-v32/lib/string.c323
-rw-r--r--arch/cris/arch-v32/lib/usercopy.c8
12 files changed, 488 insertions, 826 deletions
diff --git a/arch/cris/arch-v32/lib/Makefile b/arch/cris/arch-v32/lib/Makefile
index 05b3ec6978d..dd296b9db03 100644
--- a/arch/cris/arch-v32/lib/Makefile
+++ b/arch/cris/arch-v32/lib/Makefile
@@ -2,5 +2,6 @@
# Makefile for Etrax-specific library files..
#
-lib-y = checksum.o checksumcopy.o string.o usercopy.o memset.o csumcpfruser.o spinlock.o
+lib-y = checksum.o checksumcopy.o string.o usercopy.o memset.o \
+ csumcpfruser.o spinlock.o delay.o strcmp.o
diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S
index 32e66181b82..4a72a94a49a 100644
--- a/arch/cris/arch-v32/lib/checksum.S
+++ b/arch/cris/arch-v32/lib/checksum.S
@@ -1,41 +1,35 @@
/*
* A fast checksum routine using movem
- * Copyright (c) 1998-2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
*
* csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
.globl csum_partial
+ .type csum_partial,@function
csum_partial:
;; r10 - src
;; r11 - length
;; r12 - checksum
- ;; check for breakeven length between movem and normal word looping versions
- ;; we also do _NOT_ want to compute a checksum over more than the
- ;; actual length when length < 40
-
- cmpu.w 80,$r11
- blo _word_loop
- nop
-
- ;; need to save the registers we use below in the movem loop
- ;; this overhead is why we have a check above for breakeven length
- ;; only r0 - r8 have to be saved, the other ones are clobber-able
- ;; according to the ABI
+ ;; Optimized for large packets
+ subq 10*4, $r11
+ blt _word_loop
+ move.d $r11, $acr
subq 9*4,$sp
- subq 10*4,$r11 ; update length for the first loop
+ clearf c
movem $r8,[$sp]
;; do a movem checksum
_mloop: movem [$r10+],$r9 ; read 10 longwords
-
+ ;; Loop count without touching the c flag.
+ addoq -10*4, $acr, $acr
;; perform dword checksumming on the 10 longwords
- add.d $r0,$r12
+ addc $r0,$r12
addc $r1,$r12
addc $r2,$r12
addc $r3,$r12
@@ -46,60 +40,41 @@ _mloop: movem [$r10+],$r9 ; read 10 longwords
addc $r8,$r12
addc $r9,$r12
- ;; fold the carry into the checksum, to avoid having to loop the carry
- ;; back into the top
-
- addc 0,$r12
- addc 0,$r12 ; do it again, since we might have generated a carry
-
- subq 10*4,$r11
- bge _mloop
- nop
-
- addq 10*4,$r11 ; compensate for last loop underflowing length
+ ;; test $acr without trashing carry.
+ move.d $acr, $acr
+ bpl _mloop
+ ;; r11 <= acr is not really needed in the mloop, just using the dslot
+ ;; to prepare for what is needed after mloop.
+ move.d $acr, $r11
+ ;; fold the last carry into r13
+ addc 0, $r12
movem [$sp+],$r8 ; restore regs
_word_loop:
- ;; only fold if there is anything to fold.
-
- cmpq 0,$r12
- beq _no_fold
-
- ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below.
- ;; r9 and r13 can be used as temporaries.
+ addq 10*4,$r11 ; compensate for last loop underflowing length
moveq -1,$r9 ; put 0xffff in r9, faster than move.d 0xffff,r9
lsrq 16,$r9
move.d $r12,$r13
lsrq 16,$r13 ; r13 = checksum >> 16
- and.d $r9,$r12 ; checksum = checksum & 0xffff
- add.d $r13,$r12 ; checksum += r13
- move.d $r12,$r13 ; do the same again, maybe we got a carry last add
- lsrq 16,$r13
- and.d $r9,$r12
- add.d $r13,$r12
+ and.d $r9,$r12 ; checksum = checksum & 0xffff
_no_fold:
- cmpq 2,$r11
+ subq 2,$r11
blt _no_words
- nop
+ add.d $r13,$r12 ; checksum += r13
;; checksum the rest of the words
-
- subq 2,$r11
-
_wloop: subq 2,$r11
bge _wloop
addu.w [$r10+],$r12
- addq 2,$r11
-
_no_words:
+ addq 2,$r11
;; see if we have one odd byte more
- cmpq 1,$r11
- beq _do_byte
+ bne _do_byte
nop
ret
move.d $r12,$r10
@@ -109,3 +84,5 @@ _do_byte:
addu.b [$r10],$r12
ret
move.d $r12,$r10
+
+ .size csum_partial, .-csum_partial
diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S
index 9303ccbadc6..54e209f18b0 100644
--- a/arch/cris/arch-v32/lib/checksumcopy.S
+++ b/arch/cris/arch-v32/lib/checksumcopy.S
@@ -1,6 +1,6 @@
/*
* A fast checksum+copy routine using movem
- * Copyright (c) 1998, 2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
*
* Authors: Bjorn Wesen
*
@@ -9,6 +9,7 @@
*/
.globl csum_partial_copy_nocheck
+ .type csum_partial_copy_nocheck,@function
csum_partial_copy_nocheck:
;; r10 - src
@@ -16,32 +17,23 @@ csum_partial_copy_nocheck:
;; r12 - length
;; r13 - checksum
- ;; check for breakeven length between movem and normal word looping versions
- ;; we also do _NOT_ want to compute a checksum over more than the
- ;; actual length when length < 40
-
- cmpu.w 80,$r12
- blo _word_loop
- nop
-
- ;; need to save the registers we use below in the movem loop
- ;; this overhead is why we have a check above for breakeven length
- ;; only r0 - r8 have to be saved, the other ones are clobber-able
- ;; according to the ABI
+ ;; Optimized for large packets
+ subq 10*4, $r12
+ blt _word_loop
+ move.d $r12, $acr
subq 9*4,$sp
- subq 10*4,$r12 ; update length for the first loop
+ clearf c
movem $r8,[$sp]
;; do a movem copy and checksum
-
1: ;; A failing userspace access (the read) will have this as PC.
_mloop: movem [$r10+],$r9 ; read 10 longwords
+ addoq -10*4, $acr, $acr ; loop counter in latency cycle
movem $r9,[$r11+] ; write 10 longwords
;; perform dword checksumming on the 10 longwords
-
- add.d $r0,$r13
+ addc $r0,$r13
addc $r1,$r13
addc $r2,$r13
addc $r3,$r13
@@ -52,47 +44,30 @@ _mloop: movem [$r10+],$r9 ; read 10 longwords
addc $r8,$r13
addc $r9,$r13
- ;; fold the carry into the checksum, to avoid having to loop the carry
- ;; back into the top
-
- addc 0,$r13
- addc 0,$r13 ; do it again, since we might have generated a carry
-
- subq 10*4,$r12
- bge _mloop
- nop
-
- addq 10*4,$r12 ; compensate for last loop underflowing length
+ ;; test $acr, without trashing carry.
+ move.d $acr, $acr
+ bpl _mloop
+ ;; r12 <= acr is needed after mloop and in the exception handlers.
+ move.d $acr, $r12
+ ;; fold the last carry into r13
+ addc 0, $r13
movem [$sp+],$r8 ; restore regs
_word_loop:
- ;; only fold if there is anything to fold.
-
- cmpq 0,$r13
- beq _no_fold
+ addq 10*4,$r12 ; compensate for last loop underflowing length
;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below
;; r9 can be used as temporary.
-
move.d $r13,$r9
lsrq 16,$r9 ; r0 = checksum >> 16
and.d 0xffff,$r13 ; checksum = checksum & 0xffff
- add.d $r9,$r13 ; checksum += r0
- move.d $r13,$r9 ; do the same again, maybe we got a carry last add
- lsrq 16,$r9
- and.d 0xffff,$r13
- add.d $r9,$r13
-_no_fold:
- cmpq 2,$r12
+ subq 2, $r12
blt _no_words
- nop
+ add.d $r9,$r13 ; checksum += r0
;; copy and checksum the rest of the words
-
- subq 2,$r12
-
2: ;; A failing userspace access for the read below will have this as PC.
_wloop: move.w [$r10+],$r9
addu.w $r9,$r13
@@ -100,12 +75,9 @@ _wloop: move.w [$r10+],$r9
bge _wloop
move.w $r9,[$r11+]
- addq 2,$r12
-
_no_words:
- ;; see if we have one odd byte more
- cmpq 1,$r12
- beq _do_byte
+ addq 2,$r12
+ bne _do_byte
nop
ret
move.d $r13,$r10
@@ -118,3 +90,5 @@ _do_byte:
move.b $r9,[$r11]
ret
move.d $r13,$r10
+
+ .size csum_partial_copy_nocheck, . - csum_partial_copy_nocheck
diff --git a/arch/cris/arch-v32/lib/delay.c b/arch/cris/arch-v32/lib/delay.c
new file mode 100644
index 00000000000..39f1ac9995b
--- /dev/null
+++ b/arch/cris/arch-v32/lib/delay.c
@@ -0,0 +1,28 @@
+/*
+ * Precise Delay Loops for ETRAX FS
+ *
+ * Copyright (C) 2006 Axis Communications AB.
+ *
+ */
+
+#include <hwregs/reg_map.h>
+#include <hwregs/reg_rdwr.h>
+#include <hwregs/timer_defs.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+
+/*
+ * On ETRAX FS, we can check the free-running read-only 100MHz timer
+ * getting 32-bit 10ns precision, theoretically good for 42.94967295
+ * seconds. Unsigned arithmetic and careful expression handles
+ * wrapping.
+ */
+
+void cris_delay10ns(u32 n10ns)
+{
+ u32 t0 = REG_RD(timer, regi_timer0, r_time);
+ while (REG_RD(timer, regi_timer0, r_time) - t0 < n10ns)
+ ;
+}
+EXPORT_SYMBOL(cris_delay10ns);
diff --git a/arch/cris/arch-v32/lib/dram_init.S b/arch/cris/arch-v32/lib/dram_init.S
deleted file mode 100644
index 158b3dbb4d9..00000000000
--- a/arch/cris/arch-v32/lib/dram_init.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* $Id: dram_init.S,v 1.4 2005/04/24 18:48:32 starvik Exp $
- *
- * DRAM/SDRAM initialization - alter with care
- * This file is intended to be included from other assembler files
- *
- * Note: This file may not modify r8 or r9 because they are used to
- * carry information from the decompresser to the kernel
- *
- * Copyright (C) 2000-2003 Axis Communications AB
- *
- * Authors: Mikael Starvik (starvik@axis.com)
- */
-
-/* Just to be certain the config file is included, we include it here
- * explicitely instead of depending on it being included in the file that
- * uses this code.
- */
-
-#include <asm/arch/hwregs/asm/reg_map_asm.h>
-#include <asm/arch/hwregs/asm/bif_core_defs_asm.h>
-
- ;; WARNING! The registers r8 and r9 are used as parameters carrying
- ;; information from the decompressor (if the kernel was compressed).
- ;; They should not be used in the code below.
-
- ; Refer to BIF MDS for a description of SDRAM initialization
-
- ; Bank configuration
- move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0), $r0
- move.d CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1
- move.d $r1, [$r0]
- move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1), $r0
- move.d CONFIG_ETRAX_SDRAM_GRP1_CONFIG, $r1
- move.d $r1, [$r0]
-
- ; Calculate value of mrs_data
- ; CAS latency = 2 && bus_width = 32 => 0x40
- ; CAS latency = 3 && bus_width = 32 => 0x60
- ; CAS latency = 2 && bus_width = 16 => 0x20
- ; CAS latency = 3 && bus_width = 16 => 0x30
-
- ; Check if value is already supplied in kernel config
- move.d CONFIG_ETRAX_SDRAM_COMMAND, $r2
- bne _set_timing
- nop
-
- move.d 0x40, $r4 ; Assume 32 bits and CAS latency = 2
- move.d CONFIG_ETRAX_SDRAM_TIMING, $r1
- and.d 0x07, $r1 ; Get CAS latency
- cmpq 2, $r1 ; CL = 2 ?
- beq _bw_check
- nop
- move.d 0x60, $r4
-
-_bw_check:
- ; Assume that group 0 width is equal to group 1. This assumption
- ; is wrong for a group 1 only hardware (such as the grand old
- ; StorPoint+).
- move.d CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1
- and.d 0x200, $r1 ; DRAM width is bit 9
- beq _set_timing
- lslq 2, $r4 ; mrs_data starts at bit 2
- lsrq 1, $r4 ; 16 bits. Shift down value.
-
- ; Set timing parameters (refresh off to avoid Guinness TR 83)
-_set_timing:
- move.d CONFIG_ETRAX_SDRAM_TIMING, $r1
- and.d ~(3 << reg_bif_core_rw_sdram_timing___ref___lsb), $r1
- move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0
- move.d $r1, [$r0]
-
- ; Issue NOP command
- move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd), $r5
- moveq regk_bif_core_nop, $r1
- move.d $r1, [$r5]
-
- ; Wait 200us
- move.d 10000, $r2
-1: bne 1b
- subq 1, $r2
-
- ; Issue initialization command sequence
- move.d _sdram_commands_start, $r2
- and.d 0x000fffff, $r2 ; Make sure commands are read from flash
- move.d _sdram_commands_end, $r3
- and.d 0x000fffff, $r3
-1: clear.d $r6
- move.b [$r2+], $r6 ; Load command
- or.d $r4, $r6 ; Add calculated mrs
- move.d $r6, [$r5] ; Write rw_sdram_cmd
- ; Wait 80 ns between each command
- move.d 4000, $r7
-2: bne 2b
- subq 1, $r7
- cmp.d $r2, $r3 ; Last command?
- bne 1b
- nop
-
- ; Start refresh
- move.d CONFIG_ETRAX_SDRAM_TIMING, $r1
- move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0
- move.d $r1, [$r0]
-
- ; Initialization finished
- ba _sdram_commands_end
- nop
-
-_sdram_commands_start:
- .byte regk_bif_core_pre ; Precharge
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_ref ; refresh
- .byte regk_bif_core_mrs ; mrs
-_sdram_commands_end:
diff --git a/arch/cris/arch-v32/lib/hw_settings.S b/arch/cris/arch-v32/lib/hw_settings.S
deleted file mode 100644
index fff9443513d..00000000000
--- a/arch/cris/arch-v32/lib/hw_settings.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * $Id: hw_settings.S,v 1.3 2005/04/24 18:36:57 starvik Exp $
- *
- * This table is used by some tools to extract hardware parameters.
- * The table should be included in the kernel and the decompressor.
- * Don't forget to update the tools if you change this table.
- *
- * Copyright (C) 2001 Axis Communications AB
- *
- * Authors: Mikael Starvik (starvik@axis.com)
- */
-
-#include <asm/arch/hwregs/asm/reg_map_asm.h>
-#include <asm/arch/hwregs/asm/bif_core_defs_asm.h>
-#include <asm/arch/hwregs/asm/gio_defs_asm.h>
-
- .ascii "HW_PARAM_MAGIC" ; Magic number
- .dword 0xc0004000 ; Kernel start address
-
- ; Debug port
-#ifdef CONFIG_ETRAX_DEBUG_PORT0
- .dword 0
-#elif defined(CONFIG_ETRAX_DEBUG_PORT1)
- .dword 1
-#elif defined(CONFIG_ETRAX_DEBUG_PORT2)
- .dword 2
-#elif defined(CONFIG_ETRAX_DEBUG_PORT3)
- .dword 3
-#else
- .dword 4 ; No debug
-#endif
-
- ; Register values
- .dword REG_ADDR(bif_core, regi_bif_core, rw_grp1_cfg)
- .dword CONFIG_ETRAX_MEM_GRP1_CONFIG
- .dword REG_ADDR(bif_core, regi_bif_core, rw_grp2_cfg)
- .dword CONFIG_ETRAX_MEM_GRP2_CONFIG
- .dword REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg)
- .dword CONFIG_ETRAX_MEM_GRP3_CONFIG
- .dword REG_ADDR(bif_core, regi_bif_core, rw_grp4_cfg)
- .dword CONFIG_ETRAX_MEM_GRP4_CONFIG
- .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0)
- .dword CONFIG_ETRAX_SDRAM_GRP0_CONFIG
- .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1)
- .dword CONFIG_ETRAX_SDRAM_GRP1_CONFIG
- .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing)
- .dword CONFIG_ETRAX_SDRAM_TIMING
- .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd)
- .dword CONFIG_ETRAX_SDRAM_COMMAND
-
- .dword REG_ADDR(gio, regi_gio, rw_pa_dout)
- .dword CONFIG_ETRAX_DEF_GIO_PA_OUT
- .dword REG_ADDR(gio, regi_gio, rw_pa_oe)
- .dword CONFIG_ETRAX_DEF_GIO_PA_OE
- .dword REG_ADDR(gio, regi_gio, rw_pb_dout)
- .dword CONFIG_ETRAX_DEF_GIO_PB_OUT
- .dword REG_ADDR(gio, regi_gio, rw_pb_oe)
- .dword CONFIG_ETRAX_DEF_GIO_PB_OE
- .dword REG_ADDR(gio, regi_gio, rw_pc_dout)
- .dword CONFIG_ETRAX_DEF_GIO_PC_OUT
- .dword REG_ADDR(gio, regi_gio, rw_pc_oe)
- .dword CONFIG_ETRAX_DEF_GIO_PC_OE
- .dword REG_ADDR(gio, regi_gio, rw_pd_dout)
- .dword CONFIG_ETRAX_DEF_GIO_PD_OUT
- .dword REG_ADDR(gio, regi_gio, rw_pd_oe)
- .dword CONFIG_ETRAX_DEF_GIO_PD_OE
- .dword REG_ADDR(gio, regi_gio, rw_pe_dout)
- .dword CONFIG_ETRAX_DEF_GIO_PE_OUT
- .dword REG_ADDR(gio, regi_gio, rw_pe_oe)
- .dword CONFIG_ETRAX_DEF_GIO_PE_OE
-
- .dword 0 ; No more register values
diff --git a/arch/cris/arch-v32/lib/memset.c b/arch/cris/arch-v32/lib/memset.c
index ffca1214674..c94ea9b3ec2 100644
--- a/arch/cris/arch-v32/lib/memset.c
+++ b/arch/cris/arch-v32/lib/memset.c
@@ -1,253 +1,259 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*# */
-/*# FUNCTION NAME: memset() */
-/*# */
-/*# PARAMETERS: void* dst; Destination address. */
-/*# int c; Value of byte to write. */
-/*# int len; Number of bytes to write. */
-/*# */
-/*# RETURNS: dst. */
-/*# */
-/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
-/*# Framework taken from memcpy. This routine is */
-/*# very sensitive to compiler changes in register allocation. */
-/*# Should really be rewritten to avoid this problem. */
-/*# */
-/*#-------------------------------------------------------------------------*/
-/*# */
-/*# HISTORY */
-/*# */
-/*# DATE NAME CHANGES */
-/*# ---- ---- ------- */
-/*# 990713 HP Tired of watching this function (or */
-/*# really, the nonoptimized generic */
-/*# implementation) take up 90% of simulator */
-/*# output. Measurements needed. */
-/*# */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-/* No, there's no macro saying 12*4, since it is "hard" to get it into
- the asm in a good way. Thus better to expose the problem everywhere.
- */
-
-/* Assuming 1 cycle per dword written or read (ok, not really true), and
- one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
- so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
-
-#define ZERO_BLOCK_SIZE (1*12*4)
-
-void *memset(void *pdst,
- int c,
- size_t plen)
+/* A memset for CRIS.
+ Copyright (C) 1999-2005 Axis Communications.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Neither the name of Axis Communications nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
+ COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE. */
+
+/* FIXME: This file should really only be used for reference, as the
+ result is somewhat depending on gcc generating what we expect rather
+ than what we describe. An assembly file should be used instead. */
+
+/* Note the multiple occurrence of the expression "12*4", including the
+ asm. It is hard to get it into the asm in a good way. Thus better to
+ expose the problem everywhere: no macro. */
+
+/* Assuming one cycle per dword written or read (ok, not really true; the
+ world is not ideal), and one cycle per instruction, then 43+3*(n/48-1)
+ <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full
+ 48-byte block to set. */
+
+#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48)
+
+/* No name ambiguities in this file. */
+__asm__ (".syntax no_register_prefix");
+
+void *memset(void *pdst, int c, unsigned int plen)
{
- /* Ok. Now we want the parameters put in special registers.
- Make sure the compiler is able to make something useful of this. */
+ /* Now we want the parameters in special registers. Make sure the
+ compiler does something usable with this. */
register char *return_dst __asm__ ("r10") = pdst;
register int n __asm__ ("r12") = plen;
register int lc __asm__ ("r11") = c;
- /* Most apps use memset sanely. Only those memsetting about 3..4
- bytes or less get penalized compared to the generic implementation
- - and that's not really sane use. */
+ /* Most apps use memset sanely. Memsetting about 3..4 bytes or less get
+ penalized here compared to the generic implementation. */
- /* Ugh. This is fragile at best. Check with newer GCC releases, if
- they compile cascaded "x |= x << 8" sanely! */
- __asm__("movu.b %0,$r13 \n\
- lslq 8,$r13 \n\
- move.b %0,$r13 \n\
- move.d $r13,%0 \n\
- lslq 16,$r13 \n\
- or.d $r13,%0"
- : "=r" (lc) : "0" (lc) : "r13");
+ /* This is fragile performancewise at best. Check with newer GCC
+ releases, if they compile cascaded "x |= x << 8" to sane code. */
+ __asm__("movu.b %0,r13 \n\
+ lslq 8,r13 \n\
+ move.b %0,r13 \n\
+ move.d r13,%0 \n\
+ lslq 16,r13 \n\
+ or.d r13,%0"
+ : "=r" (lc) /* Inputs. */
+ : "0" (lc) /* Outputs. */
+ : "r13"); /* Trash. */
{
register char *dst __asm__ ("r13") = pdst;
- /* This is NONPORTABLE, but since this whole routine is */
- /* grossly nonportable that doesn't matter. */
+ if (((unsigned long) pdst & 3) != 0
+ /* Oops! n = 0 must be a valid call, regardless of alignment. */
+ && n >= 3)
+ {
+ if ((unsigned long) dst & 1)
+ {
+ *dst = (char) lc;
+ n--;
+ dst++;
+ }
- if (((unsigned long) pdst & 3) != 0
- /* Oops! n=0 must be a legal call, regardless of alignment. */
- && n >= 3)
- {
- if ((unsigned long)dst & 1)
- {
- *dst = (char) lc;
- n--;
- dst++;
- }
-
- if ((unsigned long)dst & 2)
- {
- *(short *)dst = lc;
- n -= 2;
- dst += 2;
- }
- }
+ if ((unsigned long) dst & 2)
+ {
+ *(short *) dst = lc;
+ n -= 2;
+ dst += 2;
+ }
+ }
- /* Now the fun part. For the threshold value of this, check the equation
- above. */
- /* Decide which copying method to use. */
- if (n >= ZERO_BLOCK_SIZE)
- {
- /* For large copies we use 'movem' */
-
- /* It is not optimal to tell the compiler about clobbering any
- registers; that will move the saving/restoring of those registers
- to the function prologue/epilogue, and make non-movem sizes
- suboptimal.
-
- This method is not foolproof; it assumes that the "asm reg"
- declarations at the beginning of the function really are used
- here (beware: they may be moved to temporary registers).
- This way, we do not have to save/move the registers around into
- temporaries; we can safely use them straight away.
-
- If you want to check that the allocation was right; then
- check the equalities in the first comment. It should say
- "r13=r13, r12=r12, r11=r11" */
- __asm__ volatile (" \n\
- ;; Check that the register asm declaration got right. \n\
- ;; The GCC manual says it will work, but there *has* been bugs. \n\
- .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
- .err \n\
- .endif \n\
+ /* Decide which setting method to use. */
+ if (n >= MEMSET_BY_BLOCK_THRESHOLD)
+ {
+ /* It is not optimal to tell the compiler about clobbering any
+ registers; that will move the saving/restoring of those registers
+ to the function prologue/epilogue, and make non-block sizes
+ suboptimal. */
+ __asm__ volatile
+ ("\
+ ;; GCC does promise correct register allocations, but let's \n\
+ ;; make sure it keeps its promises. \n\
+ .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
+ .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\
+ .endif \n\
\n\
- ;; Save the registers we'll clobber in the movem process \n\
- ;; on the stack. Don't mention them to gcc, it will only be \n\
- ;; upset. \n\
- subq 11*4,$sp \n\
- movem $r10,[$sp] \n\
+ ;; Save the registers we'll clobber in the movem process \n\
+ ;; on the stack. Don't mention them to gcc, it will only be \n\
+ ;; upset. \n\
+ subq 11*4,sp \n\
+ movem r10,[sp] \n\
\n\
- move.d $r11,$r0 \n\
- move.d $r11,$r1 \n\
- move.d $r11,$r2 \n\
- move.d $r11,$r3 \n\
- move.d $r11,$r4 \n\
- move.d $r11,$r5 \n\
- move.d $r11,$r6 \n\
- move.d $r11,$r7 \n\
- move.d $r11,$r8 \n\
- move.d $r11,$r9 \n\
- move.d $r11,$r10 \n\
+ move.d r11,r0 \n\
+ move.d r11,r1 \n\
+ move.d r11,r2 \n\
+ move.d r11,r3 \n\
+ move.d r11,r4 \n\
+ move.d r11,r5 \n\
+ move.d r11,r6 \n\
+ move.d r11,r7 \n\
+ move.d r11,r8 \n\
+ move.d r11,r9 \n\
+ move.d r11,r10 \n\
\n\
- ;; Now we've got this: \n\
- ;; r13 - dst \n\
- ;; r12 - n \n\
+ ;; Now we've got this: \n\
+ ;; r13 - dst \n\
+ ;; r12 - n \n\
\n\
- ;; Update n for the first loop \n\
- subq 12*4,$r12 \n\
+ ;; Update n for the first loop \n\
+ subq 12*4,r12 \n\
0: \n\
- subq 12*4,$r12 \n\
- bge 0b \n\
- movem $r11,[$r13+] \n\
+"
+#ifdef __arch_common_v10_v32
+ /* Cater to branch offset difference between v32 and v10. We
+ assume the branch below has an 8-bit offset. */
+" setf\n"
+#endif
+" subq 12*4,r12 \n\
+ bge 0b \n\
+ movem r11,[r13+] \n\
\n\
- addq 12*4,$r12 ;; compensate for last loop underflowing n \n\
+ ;; Compensate for last loop underflowing n. \n\
+ addq 12*4,r12 \n\
\n\
- ;; Restore registers from stack \n\
- movem [$sp+],$r10"
+ ;; Restore registers from stack. \n\
+ movem [sp+],r10"
- /* Outputs */ : "=r" (dst), "=r" (n)
- /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
- }
+ /* Outputs. */
+ : "=r" (dst), "=r" (n)
+
+ /* Inputs. */
+ : "0" (dst), "1" (n), "r" (lc));
+ }
+
+ /* An ad-hoc unroll, used for 4*12-1..16 bytes. */
+ while (n >= 16)
+ {
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ n -= 16;
+ }
- /* Either we directly starts copying, using dword copying
- in a loop, or we copy as much as possible with 'movem'
- and then the last block (<44 bytes) is copied here.
- This will work since 'movem' will have updated src,dst,n. */
-
- while ( n >= 16 )
- {
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- n -= 16;
- }
-
- /* A switch() is definitely the fastest although it takes a LOT of code.
- * Particularly if you inline code this.
- */
switch (n)
- {
+ {
case 0:
break;
+
case 1:
- *(char*)dst = (char) lc;
+ *dst = (char) lc;
break;
+
case 2:
- *(short*)dst = (short) lc;
+ *(short *) dst = (short) lc;
break;
+
case 3:
- *((short*)dst)++ = (short) lc;
- *(char*)dst = (char) lc;
+ *(short *) dst = (short) lc; dst += 2;
+ *dst = (char) lc;
break;
+
case 4:
- *((long*)dst)++ = lc;
+ *(long *) dst = lc;
break;
+
case 5:
- *((long*)dst)++ = lc;
- *(char*)dst = (char) lc;
+ *(long *) dst = lc; dst += 4;
+ *dst = (char) lc;
break;
+
case 6:
- *((long*)dst)++ = lc;
- *(short*)dst = (short) lc;
+ *(long *) dst = lc; dst += 4;
+ *(short *) dst = (short) lc;
break;
+
case 7:
- *((long*)dst)++ = lc;
- *((short*)dst)++ = (short) lc;
- *(char*)dst = (char) lc;
+ *(long *) dst = lc; dst += 4;
+ *(short *) dst = (short) lc; dst += 2;
+ *dst = (char) lc;
break;
+
case 8:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc;
break;
+
case 9:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *(char*)dst = (char) lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *dst = (char) lc;
break;
+
case 10:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *(short*)dst = (short) lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(short *) dst = (short) lc;
break;
+
case 11:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((short*)dst)++ = (short) lc;
- *(char*)dst = (char) lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(short *) dst = (short) lc; dst += 2;
+ *dst = (char) lc;
break;
+
case 12:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc;
break;
+
case 13:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *(char*)dst = (char) lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *dst = (char) lc;
break;
+
case 14:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *(short*)dst = (short) lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(short *) dst = (short) lc;
break;
+
case 15:
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((long*)dst)++ = lc;
- *((short*)dst)++ = (short) lc;
- *(char*)dst = (char) lc;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(long *) dst = lc; dst += 4;
+ *(short *) dst = (short) lc; dst += 2;
+ *dst = (char) lc;
break;
- }
+ }
}
- return return_dst; /* destination pointer. */
-} /* memset() */
+ return return_dst;
+}
diff --git a/arch/cris/arch-v32/lib/nand_init.S b/arch/cris/arch-v32/lib/nand_init.S
deleted file mode 100644
index e019816facd..00000000000
--- a/arch/cris/arch-v32/lib/nand_init.S
+++ /dev/null
@@ -1,178 +0,0 @@
-##=============================================================================
-##
-## nand_init.S
-##
-## The bootrom copies data from the NAND flash to the internal RAM but
-## due to a bug/feature we can only trust the 256 first bytes. So this
-## code copies more data from NAND flash to internal RAM. Obvioulsy this
-## code must fit in the first 256 bytes so alter with care.
-##
-## Some notes about the bug/feature for future reference:
-## The bootrom copies the first 127 KB from NAND flash to internal
-## memory. The problem is that it does a bytewise copy. NAND flashes
-## does autoincrement on the address so for a 16-bite device each
-## read/write increases the address by two. So the copy loop in the
-## bootrom will discard every second byte. This is solved by inserting
-## zeroes in every second byte in the first erase block.
-##
-## The bootrom also incorrectly assumes that it can read the flash
-## linear with only one read command but the flash will actually
-## switch between normal area and spare area if you do that so we
-## can't trust more than the first 256 bytes.
-##
-##=============================================================================
-
-#include <asm/arch/hwregs/asm/reg_map_asm.h>
-#include <asm/arch/hwregs/asm/gio_defs_asm.h>
-#include <asm/arch/hwregs/asm/pinmux_defs_asm.h>
-#include <asm/arch/hwregs/asm/bif_core_defs_asm.h>
-#include <asm/arch/hwregs/asm/config_defs_asm.h>
-
-;; There are 8-bit NAND flashes and 16-bit NAND flashes.
-;; We need to treat them slightly different.
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
-#define PAGE_SIZE 256
-#else
-#error 2
-#define PAGE_SIZE 512
-#endif
-#define ERASE_BLOCK 16384
-
-;; GPIO pins connected to NAND flash
-#define CE 4
-#define CLE 5
-#define ALE 6
-#define BY 7
-
-;; Address space for NAND flash
-#define NAND_RD_ADDR 0x90000000
-#define NAND_WR_ADDR 0x94000000
-
-#define READ_CMD 0x00
-
-;; Readability macros
-#define CSP_MASK \
- REG_MASK(bif_core, rw_grp3_cfg, gated_csp0) | \
- REG_MASK(bif_core, rw_grp3_cfg, gated_csp1)
-#define CSP_VAL \
- REG_STATE(bif_core, rw_grp3_cfg, gated_csp0, rd) | \
- REG_STATE(bif_core, rw_grp3_cfg, gated_csp1, wr)
-
-;;----------------------------------------------------------------------------
-;; Macros to set/clear GPIO bits
-
-.macro SET x
- or.b (1<<\x),$r9
- move.d $r9, [$r2]
-.endm
-
-.macro CLR x
- and.b ~(1<<\x),$r9
- move.d $r9, [$r2]
-.endm
-
-;;----------------------------------------------------------------------------
-
-nand_boot:
- ;; Check if nand boot was selected
- move.d REG_ADDR(config, regi_config, r_bootsel), $r0
- move.d [$r0], $r0
- and.d REG_MASK(config, r_bootsel, boot_mode), $r0
- cmp.d REG_STATE(config, r_bootsel, boot_mode, nand), $r0
- bne normal_boot ; No NAND boot
- nop
-
-copy_nand_to_ram:
- ;; copy_nand_to_ram
- ;; Arguments
- ;; r10 - destination
- ;; r11 - source offset
- ;; r12 - size
- ;; r13 - Address to jump to after completion
- ;; Note : r10-r12 are clobbered on return
- ;; Registers used:
- ;; r0 - NAND_RD_ADDR
- ;; r1 - NAND_WR_ADDR
- ;; r2 - reg_gio_rw_pa_dout
- ;; r3 - reg_gio_r_pa_din
- ;; r4 - tmp
- ;; r5 - byte counter within a page
- ;; r6 - reg_pinmux_rw_pa
- ;; r7 - reg_gio_rw_pa_oe
- ;; r8 - reg_bif_core_rw_grp3_cfg
- ;; r9 - reg_gio_rw_pa_dout shadow
- move.d 0x90000000, $r0
- move.d 0x94000000, $r1
- move.d REG_ADDR(gio, regi_gio, rw_pa_dout), $r2
- move.d REG_ADDR(gio, regi_gio, r_pa_din), $r3
- move.d REG_ADDR(pinmux, regi_pinmux, rw_pa), $r6
- move.d REG_ADDR(gio, regi_gio, rw_pa_oe), $r7
- move.d REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg), $r8
-
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
- lsrq 1, $r11
-#endif
- ;; Set up GPIO
- move.d [$r2], $r9
- move.d [$r7], $r4
- or.b (1<<ALE) | (1 << CLE) | (1<<CE), $r4
- move.d $r4, [$r7]
-
- ;; Set up bif
- move.d [$r8], $r4
- and.d CSP_MASK, $r4
- or.d CSP_VAL, $r4
- move.d $r4, [$r8]
-
-1: ;; Copy one page
- CLR CE
- SET CLE
- moveq READ_CMD, $r4
- move.b $r4, [$r1]
- moveq 20, $r4
-2: bne 2b
- subq 1, $r4
- CLR CLE
- SET ALE
- clear.w [$r1] ; Column address = 0
- move.d $r11, $r4
- lsrq 8, $r4
- move.b $r4, [$r1] ; Row address
- lsrq 8, $r4
- move.b $r4, [$r1] ; Row adddress
- moveq 20, $r4
-2: bne 2b
- subq 1, $r4
- CLR ALE
-2: move.d [$r3], $r4
- and.d 1 << BY, $r4
- beq 2b
- movu.w PAGE_SIZE, $r5
-2: ; Copy one byte/word
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
- move.w [$r0], $r4
-#else
- move.b [$r0], $r4
-#endif
- subq 1, $r5
- bne 2b
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2
- move.w $r4, [$r10+]
- subu.w PAGE_SIZE*2, $r12
-#else
- move.b $r4, [$r10+]
- subu.w PAGE_SIZE, $r12
-#endif
- bpl 1b
- addu.w PAGE_SIZE, $r11
-
- ;; End of copy
- jump $r13
- nop
-
- ;; This will warn if the code above is too large. If you consider
- ;; to remove this you don't understand the bug/feature.
- .org 256
- .org ERASE_BLOCK
-
-normal_boot:
diff --git a/arch/cris/arch-v32/lib/spinlock.S b/arch/cris/arch-v32/lib/spinlock.S
index 2437ae7f6ed..fe610b9d775 100644
--- a/arch/cris/arch-v32/lib/spinlock.S
+++ b/arch/cris/arch-v32/lib/spinlock.S
@@ -6,28 +6,35 @@
.global cris_spin_lock
+ .type cris_spin_lock,@function
.global cris_spin_trylock
+ .type cris_spin_trylock,@function
.text
cris_spin_lock:
clearf p
-1: test.d [$r10]
+1: test.b [$r10]
beq 1b
clearf p
ax
- clear.d [$r10]
+ clear.b [$r10]
bcs 1b
clearf p
ret
nop
+ .size cris_spin_lock, . - cris_spin_lock
+
cris_spin_trylock:
clearf p
-1: move.d [$r10], $r11
+1: move.b [$r10], $r11
ax
- clear.d [$r10]
+ clear.b [$r10]
bcs 1b
clearf p
ret
- move.d $r11,$r10
+ movu.b $r11,$r10
+
+ .size cris_spin_trylock, . - cris_spin_trylock
+
diff --git a/arch/cris/arch-v32/lib/strcmp.S b/arch/cris/arch-v32/lib/strcmp.S
new file mode 100644
index 00000000000..8f7a1ee6259
--- /dev/null
+++ b/arch/cris/arch-v32/lib/strcmp.S
@@ -0,0 +1,21 @@
+; strcmp.S -- CRISv32 version.
+; Copyright (C) 2008 AXIS Communications AB
+; Written by Edgar E. Iglesias
+;
+; This source code is licensed under the GNU General Public License,
+; Version 2. See the file COPYING for more details.
+
+ .global strcmp
+ .type strcmp,@function
+strcmp:
+1:
+ move.b [$r10+], $r12
+ seq $r13
+ sub.b [$r11+], $r12
+ or.b $r12, $r13
+ beq 1b
+ nop
+
+ ret
+ movs.b $r12, $r10
+ .size strcmp, . - strcmp
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c
index 98e282ac824..c7bd6ebdc93 100644
--- a/arch/cris/arch-v32/lib/string.c
+++ b/arch/cris/arch-v32/lib/string.c
@@ -1,55 +1,59 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*# */
-/*# FUNCTION NAME: memcpy() */
-/*# */
-/*# PARAMETERS: void* dst; Destination address. */
-/*# void* src; Source address. */
-/*# int len; Number of bytes to copy. */
-/*# */
-/*# RETURNS: dst. */
-/*# */
-/*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */
-/*# about copying of overlapping memory areas. This routine is */
-/*# very sensitive to compiler changes in register allocation. */
-/*# Should really be rewritten to avoid this problem. */
-/*# */
-/*#-------------------------------------------------------------------------*/
-/*# */
-/*# HISTORY */
-/*# */
-/*# DATE NAME CHANGES */
-/*# ---- ---- ------- */
-/*# 941007 Kenny R Creation */
-/*# 941011 Kenny R Lots of optimizations and inlining. */
-/*# 941129 Ulf A Adapted for use in libc. */
-/*# 950216 HP N==0 forgotten if non-aligned src/dst. */
-/*# Added some optimizations. */
-/*# 001025 HP Make src and dst char *. Align dst to */
-/*# dword, not just word-if-both-src-and-dst- */
-/*# are-misaligned. */
-/*# */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-void *memcpy(void *pdst,
- const void *psrc,
- size_t pn)
+/* A memcpy for CRIS.
+ Copyright (C) 1994-2005 Axis Communications.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Neither the name of Axis Communications nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
+ COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE. */
+
+/* FIXME: This file should really only be used for reference, as the
+ result is somewhat depending on gcc generating what we expect rather
+ than what we describe. An assembly file should be used instead. */
+
+#include <stddef.h>
+
+/* Break even between movem and move16 is really at 38.7 * 2, but
+ modulo 44, so up to the next multiple of 44, we use ordinary code. */
+#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
+
+/* No name ambiguities in this file. */
+__asm__ (".syntax no_register_prefix");
+
+void *
+memcpy(void *pdst, const void *psrc, size_t pn)
{
- /* Ok. Now we want the parameters put in special registers.
+ /* Now we want the parameters put in special registers.
Make sure the compiler is able to make something useful of this.
- As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
+ As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
If gcc was allright, it really would need no temporaries, and no
- stack space to save stuff on. */
+ stack space to save stuff on. */
register void *return_dst __asm__ ("r10") = pdst;
- register char *dst __asm__ ("r13") = pdst;
- register const char *src __asm__ ("r11") = psrc;
+ register unsigned char *dst __asm__ ("r13") = pdst;
+ register unsigned const char *src __asm__ ("r11") = psrc;
register int n __asm__ ("r12") = pn;
-
/* When src is aligned but not dst, this makes a few extra needless
cycles. I believe it would take as many to check that the
re-alignment was unnecessary. */
@@ -59,161 +63,174 @@ void *memcpy(void *pdst,
&& n >= 3)
{
if ((unsigned long) dst & 1)
- {
- n--;
- *(char*)dst = *(char*)src;
- src++;
- dst++;
- }
+ {
+ n--;
+ *dst = *src;
+ src++;
+ dst++;
+ }
if ((unsigned long) dst & 2)
- {
- n -= 2;
- *(short*)dst = *(short*)src;
- src += 2;
- dst += 2;
- }
+ {
+ n -= 2;
+ *(short *) dst = *(short *) src;
+ src += 2;
+ dst += 2;
+ }
}
- /* Decide which copying method to use. Movem is dirt cheap, so the
- overheap is low enough to always use the minimum block size as the
- threshold. */
- if (n >= 44)
- {
- /* For large copies we use 'movem' */
-
- /* It is not optimal to tell the compiler about clobbering any
- registers; that will move the saving/restoring of those registers
- to the function prologue/epilogue, and make non-movem sizes
- suboptimal. */
- __asm__ volatile (" \n\
- ;; Check that the register asm declaration got right. \n\
- ;; The GCC manual explicitly says TRT will happen. \n\
- .ifnc %0-%1-%2,$r13-$r11-$r12 \n\
- .err \n\
- .endif \n\
- \n\
- ;; Save the registers we'll use in the movem process \n\
+ /* Decide which copying method to use. */
+ if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
+ {
+ /* It is not optimal to tell the compiler about clobbering any
+ registers; that will move the saving/restoring of those registers
+ to the function prologue/epilogue, and make non-movem sizes
+ suboptimal. */
+ __asm__ volatile
+ ("\
+ ;; GCC does promise correct register allocations, but let's \n\
+ ;; make sure it keeps its promises. \n\
+ .ifnc %0-%1-%2,$r13-$r11-$r12 \n\
+ .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\
+ .endif \n\
\n\
- ;; on the stack. \n\
- subq 11*4,$sp \n\
- movem $r10,[$sp] \n\
+ ;; Save the registers we'll use in the movem process \n\
+ ;; on the stack. \n\
+ subq 11*4,sp \n\
+ movem r10,[sp] \n\
\n\
- ;; Now we've got this: \n\
- ;; r11 - src \n\
- ;; r13 - dst \n\
- ;; r12 - n \n\
+ ;; Now we've got this: \n\
+ ;; r11 - src \n\
+ ;; r13 - dst \n\
+ ;; r12 - n \n\
\n\
- ;; Update n for the first loop \n\
- subq 44,$r12 \n\
+ ;; Update n for the first loop. \n\
+ subq 44,r12 \n\
0: \n\
- movem [$r11+],$r10 \n\
- subq 44,$r12 \n\
- bge 0b \n\
- movem $r10,[$r13+] \n\
+"
+#ifdef __arch_common_v10_v32
+ /* Cater to branch offset difference between v32 and v10. We
+ assume the branch below has an 8-bit offset. */
+" setf\n"
+#endif
+" movem [r11+],r10 \n\
+ subq 44,r12 \n\
+ bge 0b \n\
+ movem r10,[r13+] \n\
\n\
- addq 44,$r12 ;; compensate for last loop underflowing n \n\
+ ;; Compensate for last loop underflowing n. \n\
+ addq 44,r12 \n\
\n\
- ;; Restore registers from stack \n\
- movem [$sp+],$r10"
+ ;; Restore registers from stack. \n\
+ movem [sp+],r10"
- /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
- /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+ /* Outputs. */
+ : "=r" (dst), "=r" (src), "=r" (n)
- }
+ /* Inputs. */
+ : "0" (dst), "1" (src), "2" (n));
+ }
- /* Either we directly starts copying, using dword copying
- in a loop, or we copy as much as possible with 'movem'
- and then the last block (<44 bytes) is copied here.
- This will work since 'movem' will have updated src,dst,n. */
+ while (n >= 16)
+ {
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
- while ( n >= 16 )
- {
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- n -= 16;
- }
+ n -= 16;
+ }
- /* A switch() is definitely the fastest although it takes a LOT of code.
- * Particularly if you inline code this.
- */
switch (n)
- {
+ {
case 0:
break;
+
case 1:
- *(char*)dst = *(char*)src;
+ *dst = *src;
break;
+
case 2:
- *(short*)dst = *(short*)src;
+ *(short *) dst = *(short *) src;
break;
+
case 3:
- *((short*)dst)++ = *((short*)src)++;
- *(char*)dst = *(char*)src;
+ *(short *) dst = *(short *) src; dst += 2; src += 2;
+ *dst = *src;
break;
+
case 4:
- *((long*)dst)++ = *((long*)src)++;
+ *(long *) dst = *(long *) src;
break;
+
case 5:
- *((long*)dst)++ = *((long*)src)++;
- *(char*)dst = *(char*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *dst = *src;
break;
+
case 6:
- *((long*)dst)++ = *((long*)src)++;
- *(short*)dst = *(short*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(short *) dst = *(short *) src;
break;
+
case 7:
- *((long*)dst)++ = *((long*)src)++;
- *((short*)dst)++ = *((short*)src)++;
- *(char*)dst = *(char*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(short *) dst = *(short *) src; dst += 2; src += 2;
+ *dst = *src;
break;
+
case 8:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src;
break;
+
case 9:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *(char*)dst = *(char*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *dst = *src;
break;
+
case 10:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *(short*)dst = *(short*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(short *) dst = *(short *) src;
break;
+
case 11:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((short*)dst)++ = *((short*)src)++;
- *(char*)dst = *(char*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(short *) dst = *(short *) src; dst += 2; src += 2;
+ *dst = *src;
break;
+
case 12:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src;
break;
+
case 13:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *(char*)dst = *(char*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *dst = *src;
break;
+
case 14:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *(short*)dst = *(short*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(short *) dst = *(short *) src;
break;
+
case 15:
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((long*)dst)++ = *((long*)src)++;
- *((short*)dst)++ = *((short*)src)++;
- *(char*)dst = *(char*)src;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(long *) dst = *(long *) src; dst += 4; src += 4;
+ *(short *) dst = *(short *) src; dst += 2; src += 2;
+ *dst = *src;
break;
- }
+ }
- return return_dst; /* destination pointer. */
-} /* memcpy() */
+ return return_dst;
+}
diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c
index f0b08460c1b..0b5b70d5f58 100644
--- a/arch/cris/arch-v32/lib/usercopy.c
+++ b/arch/cris/arch-v32/lib/usercopy.c
@@ -34,7 +34,7 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn)
As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
FIXME: Comment for old gcc version. Check.
- If gcc was allright, it really would need no temporaries, and no
+ If gcc was alright, it really would need no temporaries, and no
stack space to save stuff on. */
register char *dst __asm__ ("r13") = pdst;
@@ -161,14 +161,14 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn)
inaccessible. */
unsigned long
-__copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn)
+__copy_user_zeroing(void *pdst, const void __user *psrc, unsigned long pn)
{
/* We want the parameters put in special registers.
Make sure the compiler is able to make something useful of this.
As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
FIXME: Comment for old gcc version. Check.
- If gcc was allright, it really would need no temporaries, and no
+ If gcc was alright, it really would need no temporaries, and no
stack space to save stuff on. */
register char *dst __asm__ ("r13") = pdst;
@@ -332,7 +332,7 @@ __do_clear_user (void __user *pto, unsigned long pn)
As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
FIXME: Comment for old gcc version. Check.
- If gcc was allright, it really would need no temporaries, and no
+ If gcc was alright, it really would need no temporaries, and no
stack space to save stuff on. */
register char *dst __asm__ ("r13") = pto;