diff options
author | Chris Zankel <czankel@tensilica.com> | 2005-06-23 22:01:20 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-06-24 00:05:21 -0700 |
commit | 249ac17e96811acc3c6402317dd5d5c89d2cbf68 (patch) | |
tree | 0a174065460de196861b85f1d9a48c88b2a2675a /arch/xtensa/lib | |
parent | 5a0015d62668e64c8b6e02e360fbbea121bfd5e6 (diff) |
[PATCH] xtensa: Architecture support for Tensilica Xtensa Part 4
The attached patches provides part 4 of an architecture implementation for the
Tensilica Xtensa CPU series.
Signed-off-by: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/xtensa/lib')
-rw-r--r-- | arch/xtensa/lib/Makefile | 7 | ||||
-rw-r--r-- | arch/xtensa/lib/checksum.S | 410 | ||||
-rw-r--r-- | arch/xtensa/lib/memcopy.S | 315 | ||||
-rw-r--r-- | arch/xtensa/lib/memset.S | 160 | ||||
-rw-r--r-- | arch/xtensa/lib/pci-auto.c | 352 | ||||
-rw-r--r-- | arch/xtensa/lib/strcasecmp.c | 32 | ||||
-rw-r--r-- | arch/xtensa/lib/strncpy_user.S | 224 | ||||
-rw-r--r-- | arch/xtensa/lib/strnlen_user.S | 147 | ||||
-rw-r--r-- | arch/xtensa/lib/usercopy.S | 321 |
9 files changed, 1968 insertions, 0 deletions
diff --git a/arch/xtensa/lib/Makefile b/arch/xtensa/lib/Makefile new file mode 100644 index 00000000000..ed935b58e8a --- /dev/null +++ b/arch/xtensa/lib/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for Xtensa-specific library files. +# + +lib-y += memcopy.o memset.o checksum.o strcasecmp.o \ + usercopy.o strncpy_user.o strnlen_user.o +lib-$(CONFIG_PCI) += pci-auto.o diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S new file mode 100644 index 00000000000..e2d64dfd530 --- /dev/null +++ b/arch/xtensa/lib/checksum.S @@ -0,0 +1,410 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Xtensa version: Copyright (C) 2001 Tensilica, Inc. by Kevin Chea + * Optimized by Joe Taylor + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/errno.h> +#include <linux/linkage.h> +#define _ASMLANGUAGE +#include <xtensa/config/core.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* + * unsigned int csum_partial(const unsigned char *buf, int len, + * unsigned int sum); + * a2 = buf + * a3 = len + * a4 = sum + * + * This function assumes 2- or 4-byte alignment. Other alignments will fail! + */ + +/* ONES_ADD converts twos-complement math to ones-complement. */ +#define ONES_ADD(sum, val) \ + add sum, sum, val ; \ + bgeu sum, val, 99f ; \ + addi sum, sum, 1 ; \ +99: ; + +.text +ENTRY(csum_partial) + /* + * Experiments with Ethernet and SLIP connections show that buf + * is aligned on either a 2-byte or 4-byte boundary. + */ + entry sp, 32 + extui a5, a2, 0, 2 + bnez a5, 8f /* branch if 2-byte aligned */ + /* Fall-through on common case, 4-byte alignment */ +1: + srli a5, a3, 5 /* 32-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a5, 2f +#else + beqz a5, 2f + slli a5, a5, 5 + add a5, a5, a2 /* a5 = end of last 32-byte chunk */ +.Loop1: +#endif + l32i a6, a2, 0 + l32i a7, a2, 4 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + l32i a6, a2, 8 + l32i a7, a2, 12 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + l32i a6, a2, 16 + l32i a7, a2, 20 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + l32i a6, a2, 24 + l32i a7, a2, 28 + ONES_ADD(a4, a6) + ONES_ADD(a4, a7) + addi a2, a2, 4*8 +#if !XCHAL_HAVE_LOOPS + blt a2, a5, .Loop1 +#endif +2: + extui a5, a3, 2, 3 /* remaining 4-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a5, 3f +#else + beqz a5, 3f + slli a5, a5, 2 + add a5, a5, a2 /* a5 = end of last 4-byte chunk */ +.Loop2: +#endif + l32i a6, a2, 0 + ONES_ADD(a4, a6) + addi a2, a2, 4 +#if !XCHAL_HAVE_LOOPS + blt a2, a5, .Loop2 +#endif +3: + _bbci.l a3, 1, 5f /* remaining 2-byte chunk */ + l16ui a6, a2, 0 + ONES_ADD(a4, a6) + addi a2, a2, 2 +5: + _bbci.l a3, 0, 7f /* remaining 1-byte chunk */ +6: l8ui a6, a2, 0 +#ifdef __XTENSA_EB__ + slli a6, a6, 8 /* load byte into bits 8..15 */ +#endif + ONES_ADD(a4, a6) +7: + mov a2, a4 + retw + + /* uncommon case, buf is 2-byte aligned */ +8: + beqz a3, 7b /* branch if len == 0 */ + beqi a3, 1, 6b /* branch if len == 1 */ + + extui a5, a2, 0, 1 + bnez a5, 8f /* branch if 1-byte aligned */ + + l16ui a6, a2, 0 /* common case, len >= 2 */ + ONES_ADD(a4, a6) + addi a2, a2, 2 /* adjust buf */ + addi a3, a3, -2 /* adjust len */ + j 1b /* now buf is 4-byte aligned */ + + /* case: odd-byte aligned, len > 1 + * This case is dog slow, so don't give us an odd address. + * (I don't think this ever happens, but just in case.) + */ +8: + srli a5, a3, 2 /* 4-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a5, 2f +#else + beqz a5, 2f + slli a5, a5, 2 + add a5, a5, a2 /* a5 = end of last 4-byte chunk */ +.Loop3: +#endif + l8ui a6, a2, 0 /* bits 24..31 */ + l16ui a7, a2, 1 /* bits 8..23 */ + l8ui a8, a2, 3 /* bits 0.. 8 */ +#ifdef __XTENSA_EB__ + slli a6, a6, 24 +#else + slli a8, a8, 24 +#endif + slli a7, a7, 8 + or a7, a7, a6 + or a7, a7, a8 + ONES_ADD(a4, a7) + addi a2, a2, 4 +#if !XCHAL_HAVE_LOOPS + blt a2, a5, .Loop3 +#endif +2: + _bbci.l a3, 1, 3f /* remaining 2-byte chunk, still odd addr */ + l8ui a6, a2, 0 + l8ui a7, a2, 1 +#ifdef __XTENSA_EB__ + slli a6, a6, 8 +#else + slli a7, a7, 8 +#endif + or a7, a7, a6 + ONES_ADD(a4, a7) + addi a2, a2, 2 +3: + j 5b /* branch to handle the remaining byte */ + + + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for each access type. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, + int sum, int *src_err_ptr, int *dst_err_ptr) + a2 = src + a3 = dst + a4 = len + a5 = sum + a6 = src_err_ptr + a7 = dst_err_ptr + a8 = temp + a9 = temp + a10 = temp + a11 = original len for exception handling + a12 = original dst for exception handling + + This function is optimized for 4-byte aligned addresses. Other + alignments work, but not nearly as efficiently. + */ + +ENTRY(csum_partial_copy_generic) + entry sp, 32 + mov a12, a3 + mov a11, a4 + or a10, a2, a3 + + /* We optimize the following alignment tests for the 4-byte + aligned case. Two bbsi.l instructions might seem more optimal + (commented out below). However, both labels 5: and 3: are out + of the imm8 range, so the assembler relaxes them into + equivalent bbci.l, j combinations, which is actually + slower. */ + + extui a9, a10, 0, 2 + beqz a9, 1f /* branch if both are 4-byte aligned */ + bbsi.l a10, 0, 5f /* branch if one address is odd */ + j 3f /* one address is 2-byte aligned */ + +/* _bbsi.l a10, 0, 5f */ /* branch if odd address */ +/* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */ + +1: + /* src and dst are both 4-byte aligned */ + srli a10, a4, 5 /* 32-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 2f +#else + beqz a10, 2f + slli a10, a10, 5 + add a10, a10, a2 /* a10 = end of last 32-byte src chunk */ +.Loop5: +#endif +SRC( l32i a9, a2, 0 ) +SRC( l32i a8, a2, 4 ) +DST( s32i a9, a3, 0 ) +DST( s32i a8, a3, 4 ) + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) +SRC( l32i a9, a2, 8 ) +SRC( l32i a8, a2, 12 ) +DST( s32i a9, a3, 8 ) +DST( s32i a8, a3, 12 ) + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) +SRC( l32i a9, a2, 16 ) +SRC( l32i a8, a2, 20 ) +DST( s32i a9, a3, 16 ) +DST( s32i a8, a3, 20 ) + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) +SRC( l32i a9, a2, 24 ) +SRC( l32i a8, a2, 28 ) +DST( s32i a9, a3, 24 ) +DST( s32i a8, a3, 28 ) + ONES_ADD(a5, a9) + ONES_ADD(a5, a8) + addi a2, a2, 32 + addi a3, a3, 32 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop5 +#endif +2: + extui a10, a4, 2, 3 /* remaining 4-byte chunks */ + extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 3f +#else + beqz a10, 3f + slli a10, a10, 2 + add a10, a10, a2 /* a10 = end of last 4-byte src chunk */ +.Loop6: +#endif +SRC( l32i a9, a2, 0 ) +DST( s32i a9, a3, 0 ) + ONES_ADD(a5, a9) + addi a2, a2, 4 + addi a3, a3, 4 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop6 +#endif +3: + /* + Control comes to here in two cases: (1) It may fall through + to here from the 4-byte alignment case to process, at most, + one 2-byte chunk. (2) It branches to here from above if + either src or dst is 2-byte aligned, and we process all bytes + here, except for perhaps a trailing odd byte. It's + inefficient, so align your addresses to 4-byte boundaries. + + a2 = src + a3 = dst + a4 = len + a5 = sum + */ + srli a10, a4, 1 /* 2-byte chunks */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 4f +#else + beqz a10, 4f + slli a10, a10, 1 + add a10, a10, a2 /* a10 = end of last 2-byte src chunk */ +.Loop7: +#endif +SRC( l16ui a9, a2, 0 ) +DST( s16i a9, a3, 0 ) + ONES_ADD(a5, a9) + addi a2, a2, 2 + addi a3, a3, 2 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop7 +#endif +4: + /* This section processes a possible trailing odd byte. */ + _bbci.l a4, 0, 8f /* 1-byte chunk */ +SRC( l8ui a9, a2, 0 ) +DST( s8i a9, a3, 0 ) +#ifdef __XTENSA_EB__ + slli a9, a9, 8 /* shift byte to bits 8..15 */ +#endif + ONES_ADD(a5, a9) +8: + mov a2, a5 + retw + +5: + /* Control branch to here when either src or dst is odd. We + process all bytes using 8-bit accesses. Grossly inefficient, + so don't feed us an odd address. */ + + srli a10, a4, 1 /* handle in pairs for 16-bit csum */ +#if XCHAL_HAVE_LOOPS + loopgtz a10, 6f +#else + beqz a10, 6f + slli a10, a10, 1 + add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */ +.Loop8: +#endif +SRC( l8ui a9, a2, 0 ) +SRC( l8ui a8, a2, 1 ) +DST( s8i a9, a3, 0 ) +DST( s8i a8, a3, 1 ) +#ifdef __XTENSA_EB__ + slli a9, a9, 8 /* combine into a single 16-bit value */ +#else /* for checksum computation */ + slli a8, a8, 8 +#endif + or a9, a9, a8 + ONES_ADD(a5, a9) + addi a2, a2, 2 + addi a3, a3, 2 +#if !XCHAL_HAVE_LOOPS + blt a2, a10, .Loop8 +#endif +6: + j 4b /* process the possible trailing odd byte */ + + +# Exception handler: +.section .fixup, "ax" +/* + a6 = src_err_ptr + a7 = dst_err_ptr + a11 = original len for exception handling + a12 = original dst for exception handling +*/ + +6001: + _movi a2, -EFAULT + s32i a2, a6, 0 /* src_err_ptr */ + + # clear the complete destination - computing the rest + # is too much work + movi a2, 0 +#if XCHAL_HAVE_LOOPS + loopgtz a11, 2f +#else + beqz a11, 2f + add a11, a11, a12 /* a11 = ending address */ +.Leloop: +#endif + s8i a2, a12, 0 + addi a12, a12, 1 +#if !XCHAL_HAVE_LOOPS + blt a12, a11, .Leloop +#endif +2: + retw + +6002: + movi a2, -EFAULT + s32i a2, a7, 0 /* dst_err_ptr */ + movi a2, 0 + retw + +.previous + diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S new file mode 100644 index 00000000000..e8f6d7eb722 --- /dev/null +++ b/arch/xtensa/lib/memcopy.S @@ -0,0 +1,315 @@ +/* + * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions + * xthal_memcpy and xthal_bcopy + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2002 - 2005 Tensilica Inc. + */ + +#include <xtensa/coreasm.h> + + .macro src_b r, w0, w1 +#ifdef __XTENSA_EB__ + src \r, \w0, \w1 +#else + src \r, \w1, \w0 +#endif + .endm + + .macro ssa8 r +#ifdef __XTENSA_EB__ + ssa8b \r +#else + ssa8l \r +#endif + .endm + + +/* + * void *memcpy(void *dst, const void *src, size_t len); + * void *memmove(void *dst, const void *src, size_t len); + * void *bcopy(const void *src, void *dst, size_t len); + * + * This function is intended to do the same thing as the standard + * library function memcpy() (or bcopy()) for most cases. + * However, where the source and/or destination references + * an instruction RAM or ROM or a data RAM or ROM, that + * source and/or destination will always be accessed with + * 32-bit load and store instructions (as required for these + * types of devices). + * + * !!!!!!! XTFIXME: + * !!!!!!! Handling of IRAM/IROM has not yet + * !!!!!!! been implemented. + * + * The bcopy version is provided here to avoid the overhead + * of an extra call, for callers that require this convention. + * + * The (general case) algorithm is as follows: + * If destination is unaligned, align it by conditionally + * copying 1 and 2 bytes. + * If source is aligned, + * do 16 bytes with a loop, and then finish up with + * 8, 4, 2, and 1 byte copies conditional on the length; + * else (if source is unaligned), + * do the same, but use SRC to align the source data. + * This code tries to use fall-through branches for the common + * case of aligned source and destination and multiple + * of 4 (or 8) length. + * + * Register use: + * a0/ return address + * a1/ stack pointer + * a2/ return value + * a3/ src + * a4/ length + * a5/ dst + * a6/ tmp + * a7/ tmp + * a8/ tmp + * a9/ tmp + * a10/ tmp + * a11/ tmp + */ + + .text + .align 4 + .global bcopy + .type bcopy,@function +bcopy: + entry sp, 16 # minimal stack frame + # a2=src, a3=dst, a4=len + mov a5, a3 # copy dst so that a2 is return value + mov a3, a2 + mov a2, a5 + j .Lcommon # go to common code for memcpy+bcopy + + +/* + * Byte by byte copy + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbytecopydone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbytecopydone + add a7, a3, a4 # a7 = end address for source +#endif /* !XCHAL_HAVE_LOOPS */ +.Lnextbyte: + l8ui a6, a3, 0 + addi a3, a3, 1 + s8i a6, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + blt a3, a7, .Lnextbyte +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbytecopydone: + retw + +/* + * Destination is unaligned + */ + + .align 4 +.Ldst1mod2: # dst is only byte aligned + _bltui a4, 7, .Lbytecopy # do short copies byte by byte + + # copy 1 byte + l8ui a6, a3, 0 + addi a3, a3, 1 + addi a4, a4, -1 + s8i a6, a5, 0 + addi a5, a5, 1 + _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then + # return to main algorithm +.Ldst2mod4: # dst 16-bit aligned + # copy 2 bytes + _bltui a4, 6, .Lbytecopy # do short copies byte by byte + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + addi a4, a4, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + j .Ldstaligned # dst is now aligned, return to main algorithm + + .align 4 + .global memcpy + .type memcpy,@function +memcpy: + .global memmove + .type memmove,@function +memmove: + + entry sp, 16 # minimal stack frame + # a2/ dst, a3/ src, a4/ len + mov a5, a2 # copy dst so that a2 is return value +.Lcommon: + _bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2 + _bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4 +.Ldstaligned: # return here from .Ldst?mod? once dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + movi a8, 3 # if source is not aligned, + _bany a3, a8, .Lsrcunaligned # then use shifting copy + /* + * Destination and source are word-aligned, use word copy. + */ + # copy 16 bytes per iteration for word-aligned dst and word-aligned src +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop1done + slli a8, a7, 4 + add a8, a8, a3 # a8 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1: + l32i a6, a3, 0 + l32i a7, a3, 4 + s32i a6, a5, 0 + l32i a6, a3, 8 + s32i a7, a5, 4 + l32i a7, a3, 12 + s32i a6, a5, 8 + addi a3, a3, 16 + s32i a7, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a8, .Loop1 +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1done: + bbci.l a4, 3, .L2 + # copy 8 bytes + l32i a6, a3, 0 + l32i a7, a3, 4 + addi a3, a3, 8 + s32i a6, a5, 0 + s32i a7, a5, 4 + addi a5, a5, 8 +.L2: + bbsi.l a4, 2, .L3 + bbsi.l a4, 1, .L4 + bbsi.l a4, 0, .L5 + retw +.L3: + # copy 4 bytes + l32i a6, a3, 0 + addi a3, a3, 4 + s32i a6, a5, 0 + addi a5, a5, 4 + bbsi.l a4, 1, .L4 + bbsi.l a4, 0, .L5 + retw +.L4: + # copy 2 bytes + l16ui a6, a3, 0 + addi a3, a3, 2 + s16i a6, a5, 0 + addi a5, a5, 2 + bbsi.l a4, 0, .L5 + retw +.L5: + # copy 1 byte + l8ui a6, a3, 0 + s8i a6, a5, 0 + retw + +/* + * Destination is aligned, Source is unaligned + */ + + .align 4 +.Lsrcunaligned: + _beqz a4, .Ldone # avoid loading anything for zero-length copies + # copy 16 bytes per iteration for word-aligned dst and unaligned src + ssa8 a3 # set shift amount from byte offset +#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS (simulator) with the + lint or ferret client, or 0 to save a few cycles */ +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + and a11, a3, a8 # save unalignment offset for below + sub a3, a3, a11 # align a3 +#endif + l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop2done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop2done + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2: + l32i a7, a3, 4 + l32i a8, a3, 8 + src_b a6, a6, a7 + s32i a6, a5, 0 + l32i a9, a3, 12 + src_b a7, a7, a8 + s32i a7, a5, 4 + l32i a6, a3, 16 + src_b a8, a8, a9 + s32i a8, a5, 8 + addi a3, a3, 16 + src_b a9, a9, a6 + s32i a9, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + blt a3, a10, .Loop2 +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2done: + bbci.l a4, 3, .L12 + # copy 8 bytes + l32i a7, a3, 4 + l32i a8, a3, 8 + src_b a6, a6, a7 + s32i a6, a5, 0 + addi a3, a3, 8 + src_b a7, a7, a8 + s32i a7, a5, 4 + addi a5, a5, 8 + mov a6, a8 +.L12: + bbci.l a4, 2, .L13 + # copy 4 bytes + l32i a7, a3, 4 + addi a3, a3, 4 + src_b a6, a6, a7 + s32i a6, a5, 0 + addi a5, a5, 4 + mov a6, a7 +.L13: +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + add a3, a3, a11 # readjust a3 with correct misalignment +#endif + bbsi.l a4, 1, .L14 + bbsi.l a4, 0, .L15 +.Ldone: retw +.L14: + # copy 2 bytes + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + bbsi.l a4, 0, .L15 + retw +.L15: + # copy 1 byte + l8ui a6, a3, 0 + s8i a6, a5, 0 + retw + +/* + * Local Variables: + * mode:fundamental + * comment-start: "# " + * comment-start-skip: "# *" + * End: + */ diff --git a/arch/xtensa/lib/memset.S b/arch/xtensa/lib/memset.S new file mode 100644 index 00000000000..4de25134bc6 --- /dev/null +++ b/arch/xtensa/lib/memset.S @@ -0,0 +1,160 @@ +/* + * arch/xtensa/lib/memset.S + * + * ANSI C standard library function memset + * (Well, almost. .fixup code might return zero.) + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * Copyright (C) 2002 Tensilica Inc. + */ + +#include <xtensa/coreasm.h> + +/* + * void *memset(void *dst, int c, size_t length) + * + * The algorithm is as follows: + * Create a word with c in all byte positions + * If the destination is aligned, + * do 16B chucks with a loop, and then finish up with + * 8B, 4B, 2B, and 1B stores conditional on the length. + * If destination is unaligned, align it by conditionally + * setting 1B and 2B and then go to aligned case. + * This code tries to use fall-through branches for the common + * case of an aligned destination (except for the branches to + * the alignment labels). + */ + +/* Load or store instructions that may cause exceptions use the EX macro. */ + +#define EX(insn,reg1,reg2,offset,handler) \ +9: insn reg1, reg2, offset; \ + .section __ex_table, "a"; \ + .word 9b, handler; \ + .previous + + +.text +.align 4 +.global memset +.type memset,@function +memset: + entry sp, 16 # minimal stack frame + # a2/ dst, a3/ c, a4/ length + extui a3, a3, 0, 8 # mask to just 8 bits + slli a7, a3, 8 # duplicate character in all bytes of word + or a3, a3, a7 # ... + slli a7, a3, 16 # ... + or a3, a3, a7 # ... + mov a5, a2 # copy dst so that a2 is return value + movi a6, 3 # for alignment tests + bany a2, a6, .Ldstunaligned # if dst is unaligned +.L0: # return here from .Ldstunaligned when dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + bnez a4, .Laligned + retw + +/* + * Destination is word-aligned. + */ + # set 16 bytes per iteration for word-aligned dst + .align 4 # 1 mod 4 alignment for LOOPNEZ + .byte 0 # (0 mod 4 alignment for LBEG) +.Laligned: +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop1done + slli a6, a7, 4 + add a6, a6, a5 # a6 = end of last 16B chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1: + EX(s32i, a3, a5, 0, memset_fixup) + EX(s32i, a3, a5, 4, memset_fixup) + EX(s32i, a3, a5, 8, memset_fixup) + EX(s32i, a3, a5, 12, memset_fixup) + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + blt a5, a6, .Loop1 +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1done: + bbci.l a4, 3, .L2 + # set 8 bytes + EX(s32i, a3, a5, 0, memset_fixup) + EX(s32i, a3, a5, 4, memset_fixup) + addi a5, a5, 8 +.L2: + bbci.l a4, 2, .L3 + # set 4 bytes + EX(s32i, a3, a5, 0, memset_fixup) + addi a5, a5, 4 +.L3: + bbci.l a4, 1, .L4 + # set 2 bytes + EX(s16i, a3, a5, 0, memset_fixup) + addi a5, a5, 2 +.L4: + bbci.l a4, 0, .L5 + # set 1 byte + EX(s8i, a3, a5, 0, memset_fixup) +.L5: +.Lret1: + retw + +/* + * Destination is unaligned + */ + +.Ldstunaligned: + bltui a4, 8, .Lbyteset # do short copies byte by byte + bbci.l a5, 0, .L20 # branch if dst alignment half-aligned + # dst is only byte aligned + # set 1 byte + EX(s8i, a3, a5, 0, memset_fixup) + addi a5, a5, 1 + addi a4, a4, -1 + # now retest if dst aligned + bbci.l a5, 1, .L0 # if now aligned, return to main algorithm +.L20: + # dst half-aligned + # set 2 bytes + EX(s16i, a3, a5, 0, memset_fixup) + addi a5, a5, 2 + addi a4, a4, -2 + j .L0 # dst is now aligned, return to main algorithm + +/* + * Byte by byte set + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbyteset: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbytesetdone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbytesetdone + add a6, a5, a4 # a6 = ending address +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbyteloop: + EX(s8i, a3, a5, 0, memset_fixup) + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + blt a5, a6, .Lbyteloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbytesetdone: + retw + + + .section .fixup, "ax" + .align 4 + +/* We return zero if a failure occurred. */ + +memset_fixup: + movi a2, 0 + retw diff --git a/arch/xtensa/lib/pci-auto.c b/arch/xtensa/lib/pci-auto.c new file mode 100644 index 00000000000..90c790f6123 --- /dev/null +++ b/arch/xtensa/lib/pci-auto.c @@ -0,0 +1,352 @@ +/* + * arch/xtensa/kernel/pci-auto.c + * + * PCI autoconfiguration library + * + * Copyright (C) 2001 - 2005 Tensilica Inc. + * + * Chris Zankel <zankel@tensilica.com, cez@zankel.net> + * + * Based on work from Matt Porter <mporter@mvista.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> + +#include <asm/pci-bridge.h> + + +/* + * + * Setting up a PCI + * + * pci_ctrl->first_busno = <first bus number (0)> + * pci_ctrl->last_busno = <last bus number (0xff)> + * pci_ctrl->ops = <PCI config operations> + * pci_ctrl->map_irq = <function to return the interrupt number for a device> + * + * pci_ctrl->io_space.start = <IO space start address (PCI view)> + * pci_ctrl->io_space.end = <IO space end address (PCI view)> + * pci_ctrl->io_space.base = <IO space offset: address 0 from CPU space> + * pci_ctrl->mem_space.start = <MEM space start address (PCI view)> + * pci_ctrl->mem_space.end = <MEM space end address (PCI view)> + * pci_ctrl->mem_space.base = <MEM space offset: address 0 from CPU space> + * + * pcibios_init_resource(&pci_ctrl->io_resource, <IO space start>, + * <IO space end>, IORESOURCE_IO, "PCI host bridge"); + * pcibios_init_resource(&pci_ctrl->mem_resources[0], <MEM space start>, + * <MEM space end>, IORESOURCE_MEM, "PCI host bridge"); + * + * pci_ctrl->last_busno = pciauto_bus_scan(pci_ctrl,pci_ctrl->first_busno); + * + * int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) + * + */ + + +/* define DEBUG to print some debugging messages. */ + +#undef DEBUG + +#ifdef DEBUG +# define DBG(x...) printk(x) +#else +# define DBG(x...) +#endif + +static int pciauto_upper_iospc; +static int pciauto_upper_memspc; + +static struct pci_dev pciauto_dev; +static struct pci_bus pciauto_bus; + +/* + * Helper functions + */ + +/* Initialize the bars of a PCI device. */ + +static void __init +pciauto_setup_bars(struct pci_dev *dev, int bar_limit) +{ + int bar_size; + int bar, bar_nr; + int *upper_limit; + int found_mem64 = 0; + + for (bar = PCI_BASE_ADDRESS_0, bar_nr = 0; + bar <= bar_limit; + bar+=4, bar_nr++) + { + /* Tickle the BAR and get the size */ + pci_write_config_dword(dev, bar, 0xffffffff); + pci_read_config_dword(dev, bar, &bar_size); + + /* If BAR is not implemented go to the next BAR */ + if (!bar_size) + continue; + + /* Check the BAR type and set our address mask */ + if (bar_size & PCI_BASE_ADDRESS_SPACE_IO) + { + bar_size &= PCI_BASE_ADDRESS_IO_MASK; + upper_limit = &pciauto_upper_iospc; + DBG("PCI Autoconfig: BAR %d, I/O, ", bar_nr); + } + else + { + if ((bar_size & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == + PCI_BASE_ADDRESS_MEM_TYPE_64) + found_mem64 = 1; + + bar_size &= PCI_BASE_ADDRESS_MEM_MASK; + upper_limit = &pciauto_upper_memspc; + DBG("PCI Autoconfig: BAR %d, Mem, ", bar_nr); + } + + /* Allocate a base address (bar_size is negative!) */ + *upper_limit = (*upper_limit + bar_size) & bar_size; + + /* Write it out and update our limit */ + pci_write_config_dword(dev, bar, *upper_limit); + + /* + * If we are a 64-bit decoder then increment to the + * upper 32 bits of the bar and force it to locate + * in the lower 4GB of memory. + */ + + if (found_mem64) + pci_write_config_dword(dev, (bar+=4), 0x00000000); + + DBG("size=0x%x, address=0x%x\n", ~bar_size + 1, *upper_limit); + } +} + +/* Initialize the interrupt number. */ + +static void __init +pciauto_setup_irq(struct pci_controller* pci_ctrl,struct pci_dev *dev,int devfn) +{ + u8 pin; + int irq = 0; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + + /* Fix illegal pin numbers. */ + + if (pin == 0 || pin > 4) + pin = 1; + + if (pci_ctrl->map_irq) + irq = pci_ctrl->map_irq(dev, PCI_SLOT(devfn), pin); + + if (irq == -1) + irq = 0; + + DBG("PCI Autoconfig: Interrupt %d, pin %d\n", irq, pin); + + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); +} + + +static void __init +pciauto_prescan_setup_bridge(struct pci_dev *dev, int current_bus, + int sub_bus, int *iosave, int *memsave) +{ + /* Configure bus number registers */ + pci_write_config_byte(dev, PCI_PRIMARY_BUS, current_bus); + pci_write_config_byte(dev, PCI_SECONDARY_BUS, sub_bus + 1); + pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, 0xff); + + /* Round memory allocator to 1MB boundary */ + pciauto_upper_memspc &= ~(0x100000 - 1); + *memsave = pciauto_upper_memspc; + + /* Round I/O allocator to 4KB boundary */ + pciauto_upper_iospc &= ~(0x1000 - 1); + *iosave = pciauto_upper_iospc; + + /* Set up memory and I/O filter limits, assume 32-bit I/O space */ + pci_write_config_word(dev, PCI_MEMORY_LIMIT, + ((pciauto_upper_memspc - 1) & 0xfff00000) >> 16); + pci_write_config_byte(dev, PCI_IO_LIMIT, + ((pciauto_upper_iospc - 1) & 0x0000f000) >> 8); + pci_write_config_word(dev, PCI_IO_LIMIT_UPPER16, + ((pciauto_upper_iospc - 1) & 0xffff0000) >> 16); +} + +static void __init +pciauto_postscan_setup_bridge(struct pci_dev *dev, int current_bus, int sub_bus, + int *iosave, int *memsave) +{ + int cmdstat; + + /* Configure bus number registers */ + pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, sub_bus); + + /* + * Round memory allocator to 1MB boundary. + * If no space used, allocate minimum. + */ + pciauto_upper_memspc &= ~(0x100000 - 1); + if (*memsave == pciauto_upper_memspc) + pciauto_upper_memspc -= 0x00100000; + + pci_write_config_word(dev, PCI_MEMORY_BASE, pciauto_upper_memspc >> 16); + + /* Allocate 1MB for pre-fretch */ + pci_write_config_word(dev, PCI_PREF_MEMORY_LIMIT, + ((pciauto_upper_memspc - 1) & 0xfff00000) >> 16); + + pciauto_upper_memspc -= 0x100000; + + pci_write_config_word(dev, PCI_PREF_MEMORY_BASE, + pciauto_upper_memspc >> 16); + + /* Round I/O allocator to 4KB boundary */ + pciauto_upper_iospc &= ~(0x1000 - 1); + if (*iosave == pciauto_upper_iospc) + pciauto_upper_iospc -= 0x1000; + + pci_write_config_byte(dev, PCI_IO_BASE, + (pciauto_upper_iospc & 0x0000f000) >> 8); + pci_write_config_word(dev, PCI_IO_BASE_UPPER16, + pciauto_upper_iospc >> 16); + + /* Enable memory and I/O accesses, enable bus master */ + pci_read_config_dword(dev, PCI_COMMAND, &cmdstat); + pci_write_config_dword(dev, PCI_COMMAND, + cmdstat | + PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | + PCI_COMMAND_MASTER); +} + +/* + * Scan the current PCI bus. + */ + + +int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) +{ + int sub_bus, pci_devfn, pci_class, cmdstat, found_multi=0; + unsigned short vid; + unsigned char header_type; + struct pci_dev *dev = &pciauto_dev; + + pciauto_dev.bus = &pciauto_bus; + pciauto_dev.sysdata = pci_ctrl; + pciauto_bus.ops = pci_ctrl->ops; + |