diff options
Diffstat (limited to 'arch/cris/arch-v32/lib')
| -rw-r--r-- | arch/cris/arch-v32/lib/Makefile | 3 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/checksum.S | 75 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/checksumcopy.S | 72 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/delay.c | 28 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/dram_init.S | 119 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/hw_settings.S | 72 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/memset.c | 398 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/nand_init.S | 178 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/spinlock.S | 17 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/strcmp.S | 21 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/string.c | 323 | ||||
| -rw-r--r-- | arch/cris/arch-v32/lib/usercopy.c | 8 |
12 files changed, 488 insertions, 826 deletions
diff --git a/arch/cris/arch-v32/lib/Makefile b/arch/cris/arch-v32/lib/Makefile index 05b3ec6978d..dd296b9db03 100644 --- a/arch/cris/arch-v32/lib/Makefile +++ b/arch/cris/arch-v32/lib/Makefile @@ -2,5 +2,6 @@ # Makefile for Etrax-specific library files.. # -lib-y = checksum.o checksumcopy.o string.o usercopy.o memset.o csumcpfruser.o spinlock.o +lib-y = checksum.o checksumcopy.o string.o usercopy.o memset.o \ + csumcpfruser.o spinlock.o delay.o strcmp.o diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S index 32e66181b82..4a72a94a49a 100644 --- a/arch/cris/arch-v32/lib/checksum.S +++ b/arch/cris/arch-v32/lib/checksum.S @@ -1,41 +1,35 @@ /* * A fast checksum routine using movem - * Copyright (c) 1998-2001, 2003 Axis Communications AB + * Copyright (c) 1998-2007 Axis Communications AB * * csum_partial(const unsigned char * buff, int len, unsigned int sum) */ .globl csum_partial + .type csum_partial,@function csum_partial: ;; r10 - src ;; r11 - length ;; r12 - checksum - ;; check for breakeven length between movem and normal word looping versions - ;; we also do _NOT_ want to compute a checksum over more than the - ;; actual length when length < 40 - - cmpu.w 80,$r11 - blo _word_loop - nop - - ;; need to save the registers we use below in the movem loop - ;; this overhead is why we have a check above for breakeven length - ;; only r0 - r8 have to be saved, the other ones are clobber-able - ;; according to the ABI + ;; Optimized for large packets + subq 10*4, $r11 + blt _word_loop + move.d $r11, $acr subq 9*4,$sp - subq 10*4,$r11 ; update length for the first loop + clearf c movem $r8,[$sp] ;; do a movem checksum _mloop: movem [$r10+],$r9 ; read 10 longwords - + ;; Loop count without touching the c flag. 
+ addoq -10*4, $acr, $acr ;; perform dword checksumming on the 10 longwords - add.d $r0,$r12 + addc $r0,$r12 addc $r1,$r12 addc $r2,$r12 addc $r3,$r12 @@ -46,60 +40,41 @@ _mloop: movem [$r10+],$r9 ; read 10 longwords addc $r8,$r12 addc $r9,$r12 - ;; fold the carry into the checksum, to avoid having to loop the carry - ;; back into the top - - addc 0,$r12 - addc 0,$r12 ; do it again, since we might have generated a carry - - subq 10*4,$r11 - bge _mloop - nop - - addq 10*4,$r11 ; compensate for last loop underflowing length + ;; test $acr without trashing carry. + move.d $acr, $acr + bpl _mloop + ;; r11 <= acr is not really needed in the mloop, just using the dslot + ;; to prepare for what is needed after mloop. + move.d $acr, $r11 + ;; fold the last carry into r12 + addc 0, $r12 movem [$sp+],$r8 ; restore regs _word_loop: - ;; only fold if there is anything to fold. - - cmpq 0,$r12 - beq _no_fold - - ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below. - ;; r9 and r13 can be used as temporaries. 
+ addq 10*4,$r11 ; compensate for last loop underflowing length moveq -1,$r9 ; put 0xffff in r9, faster than move.d 0xffff,r9 lsrq 16,$r9 move.d $r12,$r13 lsrq 16,$r13 ; r13 = checksum >> 16 - and.d $r9,$r12 ; checksum = checksum & 0xffff - add.d $r13,$r12 ; checksum += r13 - move.d $r12,$r13 ; do the same again, maybe we got a carry last add - lsrq 16,$r13 - and.d $r9,$r12 - add.d $r13,$r12 + and.d $r9,$r12 ; checksum = checksum & 0xffff _no_fold: - cmpq 2,$r11 + subq 2,$r11 blt _no_words - nop + add.d $r13,$r12 ; checksum += r13 ;; checksum the rest of the words - - subq 2,$r11 - _wloop: subq 2,$r11 bge _wloop addu.w [$r10+],$r12 - addq 2,$r11 - _no_words: + addq 2,$r11 ;; see if we have one odd byte more - cmpq 1,$r11 - beq _do_byte + bne _do_byte nop ret move.d $r12,$r10 @@ -109,3 +84,5 @@ _do_byte: addu.b [$r10],$r12 ret move.d $r12,$r10 + + .size csum_partial, .-csum_partial diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S index 9303ccbadc6..54e209f18b0 100644 --- a/arch/cris/arch-v32/lib/checksumcopy.S +++ b/arch/cris/arch-v32/lib/checksumcopy.S @@ -1,6 +1,6 @@ /* * A fast checksum+copy routine using movem - * Copyright (c) 1998, 2001, 2003 Axis Communications AB + * Copyright (c) 1998-2007 Axis Communications AB * * Authors: Bjorn Wesen * @@ -9,6 +9,7 @@ */ .globl csum_partial_copy_nocheck + .type csum_partial_copy_nocheck,@function csum_partial_copy_nocheck: ;; r10 - src @@ -16,32 +17,23 @@ csum_partial_copy_nocheck: ;; r12 - length ;; r13 - checksum - ;; check for breakeven length between movem and normal word looping versions - ;; we also do _NOT_ want to compute a checksum over more than the - ;; actual length when length < 40 - - cmpu.w 80,$r12 - blo _word_loop - nop - - ;; need to save the registers we use below in the movem loop - ;; this overhead is why we have a check above for breakeven length - ;; only r0 - r8 have to be saved, the other ones are clobber-able - ;; according to the ABI + ;; Optimized for 
large packets + subq 10*4, $r12 + blt _word_loop + move.d $r12, $acr subq 9*4,$sp - subq 10*4,$r12 ; update length for the first loop + clearf c movem $r8,[$sp] ;; do a movem copy and checksum - 1: ;; A failing userspace access (the read) will have this as PC. _mloop: movem [$r10+],$r9 ; read 10 longwords + addoq -10*4, $acr, $acr ; loop counter in latency cycle movem $r9,[$r11+] ; write 10 longwords ;; perform dword checksumming on the 10 longwords - - add.d $r0,$r13 + addc $r0,$r13 addc $r1,$r13 addc $r2,$r13 addc $r3,$r13 @@ -52,47 +44,30 @@ _mloop: movem [$r10+],$r9 ; read 10 longwords addc $r8,$r13 addc $r9,$r13 - ;; fold the carry into the checksum, to avoid having to loop the carry - ;; back into the top - - addc 0,$r13 - addc 0,$r13 ; do it again, since we might have generated a carry - - subq 10*4,$r12 - bge _mloop - nop - - addq 10*4,$r12 ; compensate for last loop underflowing length + ;; test $acr, without trashing carry. + move.d $acr, $acr + bpl _mloop + ;; r12 <= acr is needed after mloop and in the exception handlers. + move.d $acr, $r12 + ;; fold the last carry into r13 + addc 0, $r13 movem [$sp+],$r8 ; restore regs _word_loop: - ;; only fold if there is anything to fold. - - cmpq 0,$r13 - beq _no_fold + addq 10*4,$r12 ; compensate for last loop underflowing length ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below ;; r9 can be used as temporary. - move.d $r13,$r9 lsrq 16,$r9 ; r0 = checksum >> 16 and.d 0xffff,$r13 ; checksum = checksum & 0xffff - add.d $r9,$r13 ; checksum += r0 - move.d $r13,$r9 ; do the same again, maybe we got a carry last add - lsrq 16,$r9 - and.d 0xffff,$r13 - add.d $r9,$r13 -_no_fold: - cmpq 2,$r12 + subq 2, $r12 blt _no_words - nop + add.d $r9,$r13 ; checksum += r0 ;; copy and checksum the rest of the words - - subq 2,$r12 - 2: ;; A failing userspace access for the read below will have this as PC. 
_wloop: move.w [$r10+],$r9 addu.w $r9,$r13 @@ -100,12 +75,9 @@ _wloop: move.w [$r10+],$r9 bge _wloop move.w $r9,[$r11+] - addq 2,$r12 - _no_words: - ;; see if we have one odd byte more - cmpq 1,$r12 - beq _do_byte + addq 2,$r12 + bne _do_byte nop ret move.d $r13,$r10 @@ -118,3 +90,5 @@ _do_byte: move.b $r9,[$r11] ret move.d $r13,$r10 + + .size csum_partial_copy_nocheck, . - csum_partial_copy_nocheck diff --git a/arch/cris/arch-v32/lib/delay.c b/arch/cris/arch-v32/lib/delay.c new file mode 100644 index 00000000000..39f1ac9995b --- /dev/null +++ b/arch/cris/arch-v32/lib/delay.c @@ -0,0 +1,28 @@ +/* + * Precise Delay Loops for ETRAX FS + * + * Copyright (C) 2006 Axis Communications AB. + * + */ + +#include <hwregs/reg_map.h> +#include <hwregs/reg_rdwr.h> +#include <hwregs/timer_defs.h> +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/module.h> + +/* + * On ETRAX FS, we can check the free-running read-only 100MHz timer + * getting 32-bit 10ns precision, theoretically good for 42.94967295 + * seconds. Unsigned arithmetic and careful expression handles + * wrapping. 
+ */ + +void cris_delay10ns(u32 n10ns) +{ + u32 t0 = REG_RD(timer, regi_timer0, r_time); + while (REG_RD(timer, regi_timer0, r_time) - t0 < n10ns) + ; +} +EXPORT_SYMBOL(cris_delay10ns); diff --git a/arch/cris/arch-v32/lib/dram_init.S b/arch/cris/arch-v32/lib/dram_init.S deleted file mode 100644 index 158b3dbb4d9..00000000000 --- a/arch/cris/arch-v32/lib/dram_init.S +++ /dev/null @@ -1,119 +0,0 @@ -/* $Id: dram_init.S,v 1.4 2005/04/24 18:48:32 starvik Exp $ - * - * DRAM/SDRAM initialization - alter with care - * This file is intended to be included from other assembler files - * - * Note: This file may not modify r8 or r9 because they are used to - * carry information from the decompresser to the kernel - * - * Copyright (C) 2000-2003 Axis Communications AB - * - * Authors: Mikael Starvik (starvik@axis.com) - */ - -/* Just to be certain the config file is included, we include it here - * explicitely instead of depending on it being included in the file that - * uses this code. - */ - -#include <asm/arch/hwregs/asm/reg_map_asm.h> -#include <asm/arch/hwregs/asm/bif_core_defs_asm.h> - - ;; WARNING! The registers r8 and r9 are used as parameters carrying - ;; information from the decompressor (if the kernel was compressed). - ;; They should not be used in the code below. 
- - ; Refer to BIF MDS for a description of SDRAM initialization - - ; Bank configuration - move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0), $r0 - move.d CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1 - move.d $r1, [$r0] - move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1), $r0 - move.d CONFIG_ETRAX_SDRAM_GRP1_CONFIG, $r1 - move.d $r1, [$r0] - - ; Calculate value of mrs_data - ; CAS latency = 2 && bus_width = 32 => 0x40 - ; CAS latency = 3 && bus_width = 32 => 0x60 - ; CAS latency = 2 && bus_width = 16 => 0x20 - ; CAS latency = 3 && bus_width = 16 => 0x30 - - ; Check if value is already supplied in kernel config - move.d CONFIG_ETRAX_SDRAM_COMMAND, $r2 - bne _set_timing - nop - - move.d 0x40, $r4 ; Assume 32 bits and CAS latency = 2 - move.d CONFIG_ETRAX_SDRAM_TIMING, $r1 - and.d 0x07, $r1 ; Get CAS latency - cmpq 2, $r1 ; CL = 2 ? - beq _bw_check - nop - move.d 0x60, $r4 - -_bw_check: - ; Assume that group 0 width is equal to group 1. This assumption - ; is wrong for a group 1 only hardware (such as the grand old - ; StorPoint+). - move.d CONFIG_ETRAX_SDRAM_GRP0_CONFIG, $r1 - and.d 0x200, $r1 ; DRAM width is bit 9 - beq _set_timing - lslq 2, $r4 ; mrs_data starts at bit 2 - lsrq 1, $r4 ; 16 bits. Shift down value. 
- - ; Set timing parameters (refresh off to avoid Guinness TR 83) -_set_timing: - move.d CONFIG_ETRAX_SDRAM_TIMING, $r1 - and.d ~(3 << reg_bif_core_rw_sdram_timing___ref___lsb), $r1 - move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0 - move.d $r1, [$r0] - - ; Issue NOP command - move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd), $r5 - moveq regk_bif_core_nop, $r1 - move.d $r1, [$r5] - - ; Wait 200us - move.d 10000, $r2 -1: bne 1b - subq 1, $r2 - - ; Issue initialization command sequence - move.d _sdram_commands_start, $r2 - and.d 0x000fffff, $r2 ; Make sure commands are read from flash - move.d _sdram_commands_end, $r3 - and.d 0x000fffff, $r3 -1: clear.d $r6 - move.b [$r2+], $r6 ; Load command - or.d $r4, $r6 ; Add calculated mrs - move.d $r6, [$r5] ; Write rw_sdram_cmd - ; Wait 80 ns between each command - move.d 4000, $r7 -2: bne 2b - subq 1, $r7 - cmp.d $r2, $r3 ; Last command? - bne 1b - nop - - ; Start refresh - move.d CONFIG_ETRAX_SDRAM_TIMING, $r1 - move.d REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing), $r0 - move.d $r1, [$r0] - - ; Initialization finished - ba _sdram_commands_end - nop - -_sdram_commands_start: - .byte regk_bif_core_pre ; Precharge - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_ref ; refresh - .byte regk_bif_core_mrs ; mrs -_sdram_commands_end: diff --git a/arch/cris/arch-v32/lib/hw_settings.S b/arch/cris/arch-v32/lib/hw_settings.S deleted file mode 100644 index fff9443513d..00000000000 --- a/arch/cris/arch-v32/lib/hw_settings.S +++ /dev/null @@ -1,72 +0,0 @@ -/* - * $Id: hw_settings.S,v 1.3 2005/04/24 18:36:57 starvik Exp $ - * - * This table is used by some tools to extract hardware parameters. - * The table should be included in the kernel and the decompressor. 
- * Don't forget to update the tools if you change this table. - * - * Copyright (C) 2001 Axis Communications AB - * - * Authors: Mikael Starvik (starvik@axis.com) - */ - -#include <asm/arch/hwregs/asm/reg_map_asm.h> -#include <asm/arch/hwregs/asm/bif_core_defs_asm.h> -#include <asm/arch/hwregs/asm/gio_defs_asm.h> - - .ascii "HW_PARAM_MAGIC" ; Magic number - .dword 0xc0004000 ; Kernel start address - - ; Debug port -#ifdef CONFIG_ETRAX_DEBUG_PORT0 - .dword 0 -#elif defined(CONFIG_ETRAX_DEBUG_PORT1) - .dword 1 -#elif defined(CONFIG_ETRAX_DEBUG_PORT2) - .dword 2 -#elif defined(CONFIG_ETRAX_DEBUG_PORT3) - .dword 3 -#else - .dword 4 ; No debug -#endif - - ; Register values - .dword REG_ADDR(bif_core, regi_bif_core, rw_grp1_cfg) - .dword CONFIG_ETRAX_MEM_GRP1_CONFIG - .dword REG_ADDR(bif_core, regi_bif_core, rw_grp2_cfg) - .dword CONFIG_ETRAX_MEM_GRP2_CONFIG - .dword REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg) - .dword CONFIG_ETRAX_MEM_GRP3_CONFIG - .dword REG_ADDR(bif_core, regi_bif_core, rw_grp4_cfg) - .dword CONFIG_ETRAX_MEM_GRP4_CONFIG - .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp0) - .dword CONFIG_ETRAX_SDRAM_GRP0_CONFIG - .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cfg_grp1) - .dword CONFIG_ETRAX_SDRAM_GRP1_CONFIG - .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_timing) - .dword CONFIG_ETRAX_SDRAM_TIMING - .dword REG_ADDR(bif_core, regi_bif_core, rw_sdram_cmd) - .dword CONFIG_ETRAX_SDRAM_COMMAND - - .dword REG_ADDR(gio, regi_gio, rw_pa_dout) - .dword CONFIG_ETRAX_DEF_GIO_PA_OUT - .dword REG_ADDR(gio, regi_gio, rw_pa_oe) - .dword CONFIG_ETRAX_DEF_GIO_PA_OE - .dword REG_ADDR(gio, regi_gio, rw_pb_dout) - .dword CONFIG_ETRAX_DEF_GIO_PB_OUT - .dword REG_ADDR(gio, regi_gio, rw_pb_oe) - .dword CONFIG_ETRAX_DEF_GIO_PB_OE - .dword REG_ADDR(gio, regi_gio, rw_pc_dout) - .dword CONFIG_ETRAX_DEF_GIO_PC_OUT - .dword REG_ADDR(gio, regi_gio, rw_pc_oe) - .dword CONFIG_ETRAX_DEF_GIO_PC_OE - .dword REG_ADDR(gio, regi_gio, rw_pd_dout) - .dword 
CONFIG_ETRAX_DEF_GIO_PD_OUT - .dword REG_ADDR(gio, regi_gio, rw_pd_oe) - .dword CONFIG_ETRAX_DEF_GIO_PD_OE - .dword REG_ADDR(gio, regi_gio, rw_pe_dout) - .dword CONFIG_ETRAX_DEF_GIO_PE_OUT - .dword REG_ADDR(gio, regi_gio, rw_pe_oe) - .dword CONFIG_ETRAX_DEF_GIO_PE_OE - - .dword 0 ; No more register values diff --git a/arch/cris/arch-v32/lib/memset.c b/arch/cris/arch-v32/lib/memset.c index ffca1214674..c94ea9b3ec2 100644 --- a/arch/cris/arch-v32/lib/memset.c +++ b/arch/cris/arch-v32/lib/memset.c @@ -1,253 +1,259 @@ -/*#************************************************************************#*/ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# FUNCTION NAME: memset() */ -/*# */ -/*# PARAMETERS: void* dst; Destination address. */ -/*# int c; Value of byte to write. */ -/*# int len; Number of bytes to write. */ -/*# */ -/*# RETURNS: dst. */ -/*# */ -/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ -/*# Framework taken from memcpy. This routine is */ -/*# very sensitive to compiler changes in register allocation. */ -/*# Should really be rewritten to avoid this problem. */ -/*# */ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# HISTORY */ -/*# */ -/*# DATE NAME CHANGES */ -/*# ---- ---- ------- */ -/*# 990713 HP Tired of watching this function (or */ -/*# really, the nonoptimized generic */ -/*# implementation) take up 90% of simulator */ -/*# output. Measurements needed. */ -/*# */ -/*#-------------------------------------------------------------------------*/ - -#include <linux/types.h> - -/* No, there's no macro saying 12*4, since it is "hard" to get it into - the asm in a good way. Thus better to expose the problem everywhere. - */ - -/* Assuming 1 cycle per dword written or read (ok, not really true), and - one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) - so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. 
*/ - -#define ZERO_BLOCK_SIZE (1*12*4) - -void *memset(void *pdst, - int c, - size_t plen) +/* A memset for CRIS. + Copyright (C) 1999-2005 Axis Communications. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Neither the name of Axis Communications nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS + COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. */ + +/* FIXME: This file should really only be used for reference, as the + result is somewhat depending on gcc generating what we expect rather + than what we describe. An assembly file should be used instead. */ + +/* Note the multiple occurrence of the expression "12*4", including the + asm. It is hard to get it into the asm in a good way. Thus better to + expose the problem everywhere: no macro. 
+ */ + +/* Assuming one cycle per dword written or read (ok, not really true; the + world is not ideal), and one cycle per instruction, then 43+3*(n/48-1) + <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full + 48-byte block to set. */ + +#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48) + +/* No name ambiguities in this file. */ +__asm__ (".syntax no_register_prefix"); + +void *memset(void *pdst, int c, unsigned int plen) { - /* Ok. Now we want the parameters put in special registers. - Make sure the compiler is able to make something useful of this. */ + /* Now we want the parameters in special registers. Make sure the + compiler does something usable with this. */ register char *return_dst __asm__ ("r10") = pdst; register int n __asm__ ("r12") = plen; register int lc __asm__ ("r11") = c; - /* Most apps use memset sanely. Only those memsetting about 3..4 - bytes or less get penalized compared to the generic implementation - - and that's not really sane use. */ + /* Most apps use memset sanely. Memsetting about 3..4 bytes or less gets + penalized here compared to the generic implementation. */ - /* Ugh. This is fragile at best. Check with newer GCC releases, if - they compile cascaded "x |= x << 8" sanely! */ - __asm__("movu.b %0,$r13 \n\ - lslq 8,$r13 \n\ - move.b %0,$r13 \n\ - move.d $r13,%0 \n\ - lslq 16,$r13 \n\ - or.d $r13,%0" - : "=r" (lc) : "0" (lc) : "r13"); + /* This is fragile performancewise at best. Check with newer GCC + releases, if they compile cascaded "x |= x << 8" to sane code. */ + __asm__("movu.b %0,r13 \n\ + lslq 8,r13 \n\ + move.b %0,r13 \n\ + move.d r13,%0 \n\ + lslq 16,r13 \n\ + or.d r13,%0" + : "=r" (lc) /* Outputs. */ + : "0" (lc) /* Inputs. */ + : "r13"); /* Trash. */ { register char *dst __asm__ ("r13") = pdst; - /* This is NONPORTABLE, but since this whole routine is */ - /* grossly nonportable that doesn't matter. */ + if (((unsigned long) pdst & 3) != 0 + /* Oops! n = 0 must be a valid call, regardless of alignment. 
*/ + && n >= 3) + { + if ((unsigned long) dst & 1) + { + *dst = (char) lc; + n--; + dst++; + } - if (((unsigned long) pdst & 3) != 0 - /* Oops! n=0 must be a legal call, regardless of alignment. */ - && n >= 3) - { - if ((unsigned long)dst & 1) - { - *dst = (char) lc; - n--; - dst++; - } - - if ((unsigned long)dst & 2) - { - *(short *)dst = lc; - n -= 2; - dst += 2; - } - } + if ((unsigned long) dst & 2) + { + *(short *) dst = lc; + n -= 2; + dst += 2; + } + } - /* Now the fun part. For the threshold value of this, check the equation - above. */ - /* Decide which copying method to use. */ - if (n >= ZERO_BLOCK_SIZE) - { - /* For large copies we use 'movem' */ - - /* It is not optimal to tell the compiler about clobbering any - registers; that will move the saving/restoring of those registers - to the function prologue/epilogue, and make non-movem sizes - suboptimal. - - This method is not foolproof; it assumes that the "asm reg" - declarations at the beginning of the function really are used - here (beware: they may be moved to temporary registers). - This way, we do not have to save/move the registers around into - temporaries; we can safely use them straight away. - - If you want to check that the allocation was right; then - check the equalities in the first comment. It should say - "r13=r13, r12=r12, r11=r11" */ - __asm__ volatile (" \n\ - ;; Check that the register asm declaration got right. \n\ - ;; The GCC manual says it will work, but there *has* been bugs. \n\ - .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ - .err \n\ - .endif \n\ + /* Decide which setting method to use. */ + if (n >= MEMSET_BY_BLOCK_THRESHOLD) + { + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-block sizes + suboptimal. */ + __asm__ volatile + ("\ + ;; GCC does promise correct register allocations, but let's \n\ + ;; make sure it keeps its promises. 
\n\ + .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ + .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ + .endif \n\ \n\ - ;; Save the registers we'll clobber in the movem process \n\ - ;; on the stack. Don't mention them to gcc, it will only be \n\ - ;; upset. \n\ - subq 11*4,$sp \n\ - movem $r10,[$sp] \n\ + ;; Save the registers we'll clobber in the movem process \n\ + ;; on the stack. Don't mention them to gcc, it will only be \n\ + ;; upset. \n\ + subq 11*4,sp \n\ + movem r10,[sp] \n\ \n\ - move.d $r11,$r0 \n\ - move.d $r11,$r1 \n\ - move.d $r11,$r2 \n\ - move.d $r11,$r3 \n\ - move.d $r11,$r4 \n\ - move.d $r11,$r5 \n\ - move.d $r11,$r6 \n\ - move.d $r11,$r7 \n\ - move.d $r11,$r8 \n\ - move.d $r11,$r9 \n\ - move.d $r11,$r10 \n\ + move.d r11,r0 \n\ + move.d r11,r1 \n\ + move.d r11,r2 \n\ + move.d r11,r3 \n\ + move.d r11,r4 \n\ + move.d r11,r5 \n\ + move.d r11,r6 \n\ + move.d r11,r7 \n\ + move.d r11,r8 \n\ + move.d r11,r9 \n\ + move.d r11,r10 \n\ \n\ - ;; Now we've got this: \n\ - ;; r13 - dst \n\ - ;; r12 - n \n\ + ;; Now we've got this: \n\ + ;; r13 - dst \n\ + ;; r12 - n \n\ \n\ - ;; Update n for the first loop \n\ - subq 12*4,$r12 \n\ + ;; Update n for the first loop \n\ + subq 12*4,r12 \n\ 0: \n\ - subq 12*4,$r12 \n\ - bge 0b \n\ - movem $r11,[$r13+] \n\ +" +#ifdef __arch_common_v10_v32 + /* Cater to branch offset difference between v32 and v10. We + assume the branch below has an 8-bit offset. */ +" setf\n" +#endif +" subq 12*4,r12 \n\ + bge 0b \n\ + movem r11,[r13+] \n\ \n\ - addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ + ;; Compensate for last loop underflowing n. \n\ + addq 12*4,r12 \n\ \n\ - ;; Restore registers from stack \n\ - movem [$sp+],$r10" + ;; Restore registers from stack. \n\ + movem [sp+],r10" - /* Outputs */ : "=r" (dst), "=r" (n) - /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); - } + /* Outputs. */ + : "=r" (dst), "=r" (n) + + /* Inputs. 
*/ + : "0" (dst), "1" (n), "r" (lc)); + } + + /* An ad-hoc unroll, used for 4*12-1..16 bytes. */ + while (n >= 16) + { + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + n -= 16; + } - /* Either we directly starts copying, using dword copying - in a loop, or we copy as much as possible with 'movem' - and then the last block (<44 bytes) is copied here. - This will work since 'movem' will have updated src,dst,n. */ - - while ( n >= 16 ) - { - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - n -= 16; - } - - /* A switch() is definitely the fastest although it takes a LOT of code. - * Particularly if you inline code this. - */ switch (n) - { + { case 0: break; + case 1: - *(char*)dst = (char) lc; + *dst = (char) lc; break; + case 2: - *(short*)dst = (short) lc; + *(short *) dst = (short) lc; break; + case 3: - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; + case 4: - *((long*)dst)++ = lc; + *(long *) dst = lc; break; + case 5: - *((long*)dst)++ = lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *dst = (char) lc; break; + case 6: - *((long*)dst)++ = lc; - *(short*)dst = (short) lc; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; break; + case 7: - *((long*)dst)++ = lc; - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; + case 8: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; break; + case 9: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *dst = (char) lc; break; + case 10: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(short*)dst = (short) lc; + *(long *) dst = lc; dst += 4; + 
*(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; break; + case 11: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; + case 12: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; break; + case 13: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *dst = (char) lc; break; + case 14: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(short*)dst = (short) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; break; + case 15: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; - } + } } - return return_dst; /* destination pointer. */ -} /* memset() */ + return return_dst; +} diff --git a/arch/cris/arch-v32/lib/nand_init.S b/arch/cris/arch-v32/lib/nand_init.S deleted file mode 100644 index e019816facd..00000000000 --- a/arch/cris/arch-v32/lib/nand_init.S +++ /dev/null @@ -1,178 +0,0 @@ -##============================================================================= -## -## nand_init.S -## -## The bootrom copies data from the NAND flash to the internal RAM but -## due to a bug/feature we can only trust the 256 first bytes. So this -## code copies more data from NAND flash to internal RAM. Obvioulsy this -## code must fit in the first 256 bytes so alter with care. 
-## -## Some notes about the bug/feature for future reference: -## The bootrom copies the first 127 KB from NAND flash to internal -## memory. The problem is that it does a bytewise copy. NAND flashes -## does autoincrement on the address so for a 16-bite device each -## read/write increases the address by two. So the copy loop in the -## bootrom will discard every second byte. This is solved by inserting -## zeroes in every second byte in the first erase block. -## -## The bootrom also incorrectly assumes that it can read the flash -## linear with only one read command but the flash will actually -## switch between normal area and spare area if you do that so we -## can't trust more than the first 256 bytes. -## -##============================================================================= - -#include <asm/arch/hwregs/asm/reg_map_asm.h> -#include <asm/arch/hwregs/asm/gio_defs_asm.h> -#include <asm/arch/hwregs/asm/pinmux_defs_asm.h> -#include <asm/arch/hwregs/asm/bif_core_defs_asm.h> -#include <asm/arch/hwregs/asm/config_defs_asm.h> - -;; There are 8-bit NAND flashes and 16-bit NAND flashes. -;; We need to treat them slightly different. 
-#if CONFIG_ETRAX_FLASH_BUSWIDTH==2 -#define PAGE_SIZE 256 -#else -#error 2 -#define PAGE_SIZE 512 -#endif -#define ERASE_BLOCK 16384 - -;; GPIO pins connected to NAND flash -#define CE 4 -#define CLE 5 -#define ALE 6 -#define BY 7 - -;; Address space for NAND flash -#define NAND_RD_ADDR 0x90000000 -#define NAND_WR_ADDR 0x94000000 - -#define READ_CMD 0x00 - -;; Readability macros -#define CSP_MASK \ - REG_MASK(bif_core, rw_grp3_cfg, gated_csp0) | \ - REG_MASK(bif_core, rw_grp3_cfg, gated_csp1) -#define CSP_VAL \ - REG_STATE(bif_core, rw_grp3_cfg, gated_csp0, rd) | \ - REG_STATE(bif_core, rw_grp3_cfg, gated_csp1, wr) - -;;---------------------------------------------------------------------------- -;; Macros to set/clear GPIO bits - -.macro SET x - or.b (1<<\x),$r9 - move.d $r9, [$r2] -.endm - -.macro CLR x - and.b ~(1<<\x),$r9 - move.d $r9, [$r2] -.endm - -;;---------------------------------------------------------------------------- - -nand_boot: - ;; Check if nand boot was selected - move.d REG_ADDR(config, regi_config, r_bootsel), $r0 - move.d [$r0], $r0 - and.d REG_MASK(config, r_bootsel, boot_mode), $r0 - cmp.d REG_STATE(config, r_bootsel, boot_mode, nand), $r0 - bne normal_boot ; No NAND boot - nop - -copy_nand_to_ram: - ;; copy_nand_to_ram - ;; Arguments - ;; r10 - destination - ;; r11 - source offset - ;; r12 - size - ;; r13 - Address to jump to after completion - ;; Note : r10-r12 are clobbered on return - ;; Registers used: - ;; r0 - NAND_RD_ADDR - ;; r1 - NAND_WR_ADDR - ;; r2 - reg_gio_rw_pa_dout - ;; r3 - reg_gio_r_pa_din - ;; r4 - tmp - ;; r5 - byte counter within a page - ;; r6 - reg_pinmux_rw_pa - ;; r7 - reg_gio_rw_pa_oe - ;; r8 - reg_bif_core_rw_grp3_cfg - ;; r9 - reg_gio_rw_pa_dout shadow - move.d 0x90000000, $r0 - move.d 0x94000000, $r1 - move.d REG_ADDR(gio, regi_gio, rw_pa_dout), $r2 - move.d REG_ADDR(gio, regi_gio, r_pa_din), $r3 - move.d REG_ADDR(pinmux, regi_pinmux, rw_pa), $r6 - move.d REG_ADDR(gio, regi_gio, rw_pa_oe), $r7 - move.d 
REG_ADDR(bif_core, regi_bif_core, rw_grp3_cfg), $r8 - -#if CONFIG_ETRAX_FLASH_BUSWIDTH==2 - lsrq 1, $r11 -#endif - ;; Set up GPIO - move.d [$r2], $r9 - move.d [$r7], $r4 - or.b (1<<ALE) | (1 << CLE) | (1<<CE), $r4 - move.d $r4, [$r7] - - ;; Set up bif - move.d [$r8], $r4 - and.d CSP_MASK, $r4 - or.d CSP_VAL, $r4 - move.d $r4, [$r8] - -1: ;; Copy one page - CLR CE - SET CLE - moveq READ_CMD, $r4 - move.b $r4, [$r1] - moveq 20, $r4 -2: bne 2b - subq 1, $r4 - CLR CLE - SET ALE - clear.w [$r1] ; Column address = 0 - move.d $r11, $r4 - lsrq 8, $r4 - move.b $r4, [$r1] ; Row address - lsrq 8, $r4 - move.b $r4, [$r1] ; Row adddress - moveq 20, $r4 -2: bne 2b - subq 1, $r4 - CLR ALE -2: move.d [$r3], $r4 - and.d 1 << BY, $r4 - beq 2b - movu.w PAGE_SIZE, $r5 -2: ; Copy one byte/word -#if CONFIG_ETRAX_FLASH_BUSWIDTH==2 - move.w [$r0], $r4 -#else - move.b [$r0], $r4 -#endif - subq 1, $r5 - bne 2b -#if CONFIG_ETRAX_FLASH_BUSWIDTH==2 - move.w $r4, [$r10+] - subu.w PAGE_SIZE*2, $r12 -#else - move.b $r4, [$r10+] - subu.w PAGE_SIZE, $r12 -#endif - bpl 1b - addu.w PAGE_SIZE, $r11 - - ;; End of copy - jump $r13 - nop - - ;; This will warn if the code above is too large. If you consider - ;; to remove this you don't understand the bug/feature. - .org 256 - .org ERASE_BLOCK - -normal_boot: diff --git a/arch/cris/arch-v32/lib/spinlock.S b/arch/cris/arch-v32/lib/spinlock.S index 2437ae7f6ed..fe610b9d775 100644 --- a/arch/cris/arch-v32/lib/spinlock.S +++ b/arch/cris/arch-v32/lib/spinlock.S @@ -6,28 +6,35 @@ .global cris_spin_lock + .type cris_spin_lock,@function .global cris_spin_trylock + .type cris_spin_trylock,@function .text cris_spin_lock: clearf p -1: test.d [$r10] +1: test.b [$r10] beq 1b clearf p ax - clear.d [$r10] + clear.b [$r10] bcs 1b clearf p ret nop + .size cris_spin_lock, . 
- cris_spin_lock + cris_spin_trylock: clearf p -1: move.d [$r10], $r11 +1: move.b [$r10], $r11 ax - clear.d [$r10] + clear.b [$r10] bcs 1b clearf p ret - move.d $r11,$r10 + movu.b $r11,$r10 + + .size cris_spin_trylock, . - cris_spin_trylock + diff --git a/arch/cris/arch-v32/lib/strcmp.S b/arch/cris/arch-v32/lib/strcmp.S new file mode 100644 index 00000000000..8f7a1ee6259 --- /dev/null +++ b/arch/cris/arch-v32/lib/strcmp.S @@ -0,0 +1,21 @@ +; strcmp.S -- CRISv32 version. +; Copyright (C) 2008 AXIS Communications AB +; Written by Edgar E. Iglesias +; +; This source code is licensed under the GNU General Public License, +; Version 2. See the file COPYING for more details. + + .global strcmp + .type strcmp,@function +strcmp: +1: + move.b [$r10+], $r12 + seq $r13 + sub.b [$r11+], $r12 + or.b $r12, $r13 + beq 1b + nop + + ret + movs.b $r12, $r10 + .size strcmp, . - strcmp diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c index 98e282ac824..c7bd6ebdc93 100644 --- a/arch/cris/arch-v32/lib/string.c +++ b/arch/cris/arch-v32/lib/string.c @@ -1,55 +1,59 @@ -/*#************************************************************************#*/ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# FUNCTION NAME: memcpy() */ -/*# */ -/*# PARAMETERS: void* dst; Destination address. */ -/*# void* src; Source address. */ -/*# int len; Number of bytes to copy. */ -/*# */ -/*# RETURNS: dst. */ -/*# */ -/*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */ -/*# about copying of overlapping memory areas. This routine is */ -/*# very sensitive to compiler changes in register allocation. */ -/*# Should really be rewritten to avoid this problem. */ -/*# */ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# HISTORY */ -/*# */ -/*# DATE NAME CHANGES */ -/*# ---- ---- ------- */ -/*# 941007 Kenny R Creation */ -/*# 941011 Kenny R Lots of optimizations and inlining. 
*/ -/*# 941129 Ulf A Adapted for use in libc. */ -/*# 950216 HP N==0 forgotten if non-aligned src/dst. */ -/*# Added some optimizations. */ -/*# 001025 HP Make src and dst char *. Align dst to */ -/*# dword, not just word-if-both-src-and-dst- */ -/*# are-misaligned. */ -/*# */ -/*#-------------------------------------------------------------------------*/ - -#include <linux/types.h> - -void *memcpy(void *pdst, - const void *psrc, - size_t pn) +/* A memcpy for CRIS. + Copyright (C) 1994-2005 Axis Communications. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Neither the name of Axis Communications nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS + COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. */ + +/* FIXME: This file should really only be used for reference, as the + result is somewhat depending on gcc generating what we expect rather + than what we describe. 
An assembly file should be used instead. */ + +#include <stddef.h> + +/* Break even between movem and move16 is really at 38.7 * 2, but + modulo 44, so up to the next multiple of 44, we use ordinary code. */ +#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2) + +/* No name ambiguities in this file. */ +__asm__ (".syntax no_register_prefix"); + +void * +memcpy(void *pdst, const void *psrc, size_t pn) { - /* Ok. Now we want the parameters put in special registers. + /* Now we want the parameters put in special registers. Make sure the compiler is able to make something useful of this. - As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). + As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). If gcc was allright, it really would need no temporaries, and no - stack space to save stuff on. */ + stack space to save stuff on. */ register void *return_dst __asm__ ("r10") = pdst; - register char *dst __asm__ ("r13") = pdst; - register const char *src __asm__ ("r11") = psrc; + register unsigned char *dst __asm__ ("r13") = pdst; + register unsigned const char *src __asm__ ("r11") = psrc; register int n __asm__ ("r12") = pn; - /* When src is aligned but not dst, this makes a few extra needless cycles. I believe it would take as many to check that the re-alignment was unnecessary. */ @@ -59,161 +63,174 @@ void *memcpy(void *pdst, && n >= 3) { if ((unsigned long) dst & 1) - { - n--; - *(char*)dst = *(char*)src; - src++; - dst++; - } + { + n--; + *dst = *src; + src++; + dst++; + } if ((unsigned long) dst & 2) - { - n -= 2; - *(short*)dst = *(short*)src; - src += 2; - dst += 2; - } + { + n -= 2; + *(short *) dst = *(short *) src; + src += 2; + dst += 2; + } } - /* Decide which copying method to use. Movem is dirt cheap, so the - overheap is low enough to always use the minimum block size as the - threshold. 
*/ - if (n >= 44) - { - /* For large copies we use 'movem' */ - - /* It is not optimal to tell the compiler about clobbering any - registers; that will move the saving/restoring of those registers - to the function prologue/epilogue, and make non-movem sizes - suboptimal. */ - __asm__ volatile (" \n\ - ;; Check that the register asm declaration got right. \n\ - ;; The GCC manual explicitly says TRT will happen. \n\ - .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ - .err \n\ - .endif \n\ - \n\ - ;; Save the registers we'll use in the movem process \n\ + /* Decide which copying method to use. */ + if (n >= MEMCPY_BY_BLOCK_THRESHOLD) + { + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-movem sizes + suboptimal. */ + __asm__ volatile + ("\ + ;; GCC does promise correct register allocations, but let's \n\ + ;; make sure it keeps its promises. \n\ + .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ + .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ + .endif \n\ \n\ - ;; on the stack. \n\ - subq 11*4,$sp \n\ - movem $r10,[$sp] \n\ + ;; Save the registers we'll use in the movem process \n\ + ;; on the stack. \n\ + subq 11*4,sp \n\ + movem r10,[sp] \n\ \n\ - ;; Now we've got this: \n\ - ;; r11 - src \n\ - ;; r13 - dst \n\ - ;; r12 - n \n\ + ;; Now we've got this: \n\ + ;; r11 - src \n\ + ;; r13 - dst \n\ + ;; r12 - n \n\ \n\ - ;; Update n for the first loop \n\ - subq 44,$r12 \n\ + ;; Update n for the first loop. \n\ + subq 44,r12 \n\ 0: \n\ - movem [$r11+],$r10 \n\ - subq 44,$r12 \n\ - bge 0b \n\ - movem $r10,[$r13+] \n\ +" +#ifdef __arch_common_v10_v32 + /* Cater to branch offset difference between v32 and v10. We + assume the branch below has an 8-bit offset. 
*/ +" setf\n" +#endif +" movem [r11+],r10 \n\ + subq 44,r12 \n\ + bge 0b \n\ + movem r10,[r13+] \n\ \n\ - addq 44,$r12 ;; compensate for last loop underflowing n \n\ + ;; Compensate for last loop underflowing n. \n\ + addq 44,r12 \n\ \n\ - ;; Restore registers from stack \n\ - movem [$sp+],$r10" + ;; Restore registers from stack. \n\ + movem [sp+],r10" - /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) - /* Inputs */ : "0" (dst), "1" (src), "2" (n)); + /* Outputs. */ + : "=r" (dst), "=r" (src), "=r" (n) - } + /* Inputs. */ + : "0" (dst), "1" (src), "2" (n)); + } - /* Either we directly starts copying, using dword copying - in a loop, or we copy as much as possible with 'movem' - and then the last block (<44 bytes) is copied here. - This will work since 'movem' will have updated src,dst,n. */ + while (n >= 16) + { + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; - while ( n >= 16 ) - { - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - n -= 16; - } + n -= 16; + } - /* A switch() is definitely the fastest although it takes a LOT of code. - * Particularly if you inline code this. 
- */ switch (n) - { + { case 0: break; + case 1: - *(char*)dst = *(char*)src; + *dst = *src; break; + case 2: - *(short*)dst = *(short*)src; + *(short *) dst = *(short *) src; break; + case 3: - *((short*)dst)++ = *((short*)src)++; - *(char*)dst = *(char*)src; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; + case 4: - *((long*)dst)++ = *((long*)src)++; + *(long *) dst = *(long *) src; break; + case 5: - *((long*)dst)++ = *((long*)src)++; - *(char*)dst = *(char*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *dst = *src; break; + case 6: - *((long*)dst)++ = *((long*)src)++; - *(short*)dst = *(short*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; break; + case 7: - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; - *(char*)dst = *(char*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; + case 8: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; break; + case 9: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *(char*)dst = *(char*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *dst = *src; break; + case 10: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *(short*)dst = *(short*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; break; + case 11: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; - *(char*)dst = *(char*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = 
*src; break; + case 12: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; break; + case 13: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *(char*)dst = *(char*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *dst = *src; break; + case 14: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *(short*)dst = *(short*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; break; + case 15: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; - *(char*)dst = *(char*)src; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; - } + } - return return_dst; /* destination pointer. */ -} /* memcpy() */ + return return_dst; +} diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c index f0b08460c1b..0b5b70d5f58 100644 --- a/arch/cris/arch-v32/lib/usercopy.c +++ b/arch/cris/arch-v32/lib/usercopy.c @@ -34,7 +34,7 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn) As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). FIXME: Comment for old gcc version. Check. 
- If gcc was allright, it really would need no temporaries, and no + If gcc was alright, it really would need no temporaries, and no stack space to save stuff on. */ register char *dst __asm__ ("r13") = pdst; @@ -161,14 +161,14 @@ __copy_user (void __user *pdst, const void *psrc, unsigned long pn) inaccessible. */ unsigned long -__copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn) +__copy_user_zeroing(void *pdst, const void __user *psrc, unsigned long pn) { /* We want the parameters put in special registers. Make sure the compiler is able to make something useful of this. As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). FIXME: Comment for old gcc version. Check. - If gcc was allright, it really would need no temporaries, and no + If gcc was alright, it really would need no temporaries, and no stack space to save stuff on. */ register char *dst __asm__ ("r13") = pdst; @@ -332,7 +332,7 @@ __do_clear_user (void __user *pto, unsigned long pn) As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). FIXME: Comment for old gcc version. Check. - If gcc was allright, it really would need no temporaries, and no + If gcc was alright, it really would need no temporaries, and no stack space to save stuff on. */ register char *dst __asm__ ("r13") = pto; |
