diff options
author | Paul Mackerras <paulus@samba.org> | 2005-09-26 16:04:21 +1000 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2005-09-26 16:04:21 +1000 |
commit | 14cf11af6cf608eb8c23e989ddb17a715ddce109 (patch) | |
tree | 271a97ce73e265f39c569cb159c195c5b4bb3f8c /arch/powerpc/lib | |
parent | e5baa396af7560382d2cf3f0871d616b61fc284c (diff) |
powerpc: Merge enough to start building in arch/powerpc.
This creates the directory structure under arch/powerpc and a bunch
of Kconfig files. It does a first-cut merge of arch/powerpc/mm,
arch/powerpc/lib and arch/powerpc/platforms/powermac. This is enough
to build a 32-bit powermac kernel with ARCH=powerpc.
For now we are getting some unmerged files from arch/ppc/kernel and
arch/ppc/syslib, or arch/ppc64/kernel. This makes some minor changes
to files in those directories and files outside arch/powerpc.
The boot directory is still not merged. That's going to be interesting.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r-- | arch/powerpc/lib/Makefile | 9 | ||||
-rw-r--r-- | arch/powerpc/lib/checksum.S | 225 | ||||
-rw-r--r-- | arch/powerpc/lib/checksum64.S | 229 | ||||
-rw-r--r-- | arch/powerpc/lib/copy32.S | 543 | ||||
-rw-r--r-- | arch/powerpc/lib/copypage.S | 121 | ||||
-rw-r--r-- | arch/powerpc/lib/copyuser.S | 576 | ||||
-rw-r--r-- | arch/powerpc/lib/div64.S | 58 | ||||
-rw-r--r-- | arch/powerpc/lib/e2a.c | 108 | ||||
-rw-r--r-- | arch/powerpc/lib/memcpy.S | 172 | ||||
-rw-r--r-- | arch/powerpc/lib/rheap.c | 693 | ||||
-rw-r--r-- | arch/powerpc/lib/sstep.c | 141 | ||||
-rw-r--r-- | arch/powerpc/lib/strcase.c | 23 | ||||
-rw-r--r-- | arch/powerpc/lib/string.S | 203 | ||||
-rw-r--r-- | arch/powerpc/lib/usercopy.c | 41 |
14 files changed, 3142 insertions, 0 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile new file mode 100644 index 00000000000..347f9798e43 --- /dev/null +++ b/arch/powerpc/lib/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for ppc-specific library files.. +# + +obj-y := strcase.o string.o +obj-$(CONFIG_PPC32) += div64.o copy32.o checksum.o +obj-$(CONFIG_PPC64) += copypage.o copyuser.o memcpy.o usercopy.o \ + sstep.o checksum64.o +obj-$(CONFIG_PPC_ISERIES) += e2a.o diff --git a/arch/powerpc/lib/checksum.S b/arch/powerpc/lib/checksum.S new file mode 100644 index 00000000000..7874e8a8045 --- /dev/null +++ b/arch/powerpc/lib/checksum.S @@ -0,0 +1,225 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + + .text + +/* + * ip_fast_csum(buf, len) -- Optimized for IP header + * len is in words and is always >= 5. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addic. r4,r4,-2 + addc r0,r0,r5 + mtctr r4 + blelr- +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(saddr, daddr, len, proto, sum) + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * csum_partial(buff, len, sum) + */ +_GLOBAL(csum_partial) + addic r0,r5,0 + subi r3,r3,4 + srwi. r6,r4,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r5,r3,2 /* Align buffer to longword boundary */ + beq+ 1f + lhz r5,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r0,r0,r5 + srwi. r6,r4,2 /* # words to do */ + beq 3f +1: mtctr r6 +2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ + adde r0,r0,r5 /* be unnecessary to unroll this loop */ + bdnz 2b + andi. r4,r4,3 +3: cmpwi 0,r4,2 + blt+ 4f + lhz r5,4(r3) + addi r3,r3,2 + subi r4,r4,2 + adde r0,r0,r5 +4: cmpwi 0,r4,1 + bne+ 5f + lbz r5,4(r3) + slwi r5,r5,8 /* Upper byte of word */ + adde r0,r0,r5 +5: addze r3,r0 /* add in final carry */ + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: srwi. r6,r5,4 /* # groups of 4 words to do */ + beq 10f + mtctr r6 +71: lwz r6,4(r3) +72: lwz r9,8(r3) +73: lwz r10,12(r3) +74: lwzu r11,16(r3) + adde r0,r0,r6 +75: stw r6,4(r4) + adde r0,r0,r9 +76: stw r9,8(r4) + adde r0,r0,r10 +77: stw r10,12(r4) + adde r0,r0,r11 +78: stwu r11,16(r4) + bdnz 71b +10: rlwinm. r6,r5,30,30,31 /* # words left to do */ + beq 13f + mtctr r6 +82: lwzu r9,4(r3) +92: stwu r9,4(r4) + adde r0,r0,r9 + bdnz 82b +13: andi. r5,r5,3 +3: cmpwi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpwi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry */ + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + +src_error_4: + mfctr r6 /* update # bytes remaining from ctr */ + rlwimi r5,r6,4,0,27 + b 79f +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 +79: srwi. r6,r5,2 + beq 3f + mtctr r6 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b +src_error: + cmpwi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + +dst_error: + cmpwi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .long 81b,src_error_1 + .long 91b,dst_error + .long 71b,src_error_4 + .long 72b,src_error_4 + .long 73b,src_error_4 + .long 74b,src_error_4 + .long 75b,dst_error + .long 76b,dst_error + .long 77b,dst_error + .long 78b,dst_error + .long 82b,src_error_2 + .long 92b,dst_error + .long 83b,src_error_3 + .long 93b,dst_error + .long 84b,src_error_3 + .long 94b,dst_error + .long 95b,dst_error + .long 96b,dst_error + .long 97b,dst_error diff --git a/arch/powerpc/lib/checksum64.S b/arch/powerpc/lib/checksum64.S new file mode 100644 index 00000000000..ef96c6c58ef --- /dev/null +++ b/arch/powerpc/lib/checksum64.S @@ -0,0 +1,229 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + +/* + * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header + * len is in words and is always >= 5. + * + * In practice len == 5, but this is not guaranteed. So this code does not + * attempt to use doubleword instructions. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addic. r4,r4,-2 + addc r0,r0,r5 + mtctr r4 + blelr- +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32-bit halves together */ + add r0,r0,r4 + srdi r0,r0,32 + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) + * No real gain trying to do this specially for 64 bit, but + * the 32 bit addition may spill into the upper bits of + * the doubleword so we still must fold it down from 64. + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + rldicl r4,r0,32,0 /* fold 64 bit value */ + add r0,r4,r0 + srdi r0,r0,32 + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). + * + * This code assumes at least halfword alignment, though the length + * can be any number of bytes. The sum is accumulated in r5. + * + * csum_partial(r3=buff, r4=len, r5=sum) + */ +_GLOBAL(csum_partial) + subi r3,r3,8 /* we'll offset by 8 for the loads */ + srdi. r6,r4,3 /* divide by 8 for doubleword count */ + addic r5,r5,0 /* clear carry */ + beq 3f /* if we're doing < 8 bytes */ + andi. r0,r3,2 /* aligned on a word boundary already? */ + beq+ 1f + lhz r6,8(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r5,r5,r6 + srdi. r6,r4,3 /* recompute number of doublewords */ + beq 3f /* any left? */ +1: mtctr r6 +2: ldu r6,8(r3) /* main sum loop */ + adde r5,r5,r6 + bdnz 2b + andi. r4,r4,7 /* compute bytes left to sum after doublewords */ +3: cmpwi 0,r4,4 /* is at least a full word left? */ + blt 4f + lwz r6,8(r3) /* sum this word */ + addi r3,r3,4 + subi r4,r4,4 + adde r5,r5,r6 +4: cmpwi 0,r4,2 /* is at least a halfword left? */ + blt+ 5f + lhz r6,8(r3) /* sum this halfword */ + addi r3,r3,2 + subi r4,r4,2 + adde r5,r5,r6 +5: cmpwi 0,r4,1 /* is at least a byte left? */ + bne+ 6f + lbz r6,8(r3) /* sum this byte */ + slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ + adde r5,r5,r6 +6: addze r5,r5 /* add in final carry */ + rldicl r4,r5,32,0 /* fold two 32-bit halves together */ + add r3,r4,r5 + srdi r3,r3,32 + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * This code needs to be reworked to take advantage of 64 bit sum+copy. + * However, due to tokenring halfword alignment problems this will be very + * tricky. For now we'll leave it until we instrument it somehow. + * + * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: mtctr r6 +82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ +92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ + adde r0,r0,r6 + bdnz 82b + andi. r5,r5,3 +3: cmpwi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpwi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ + rldicl r4,r3,32,0 /* fold 64 bit value */ + add r3,r4,r3 + srdi r3,r3,32 + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + + .globl src_error_1 +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 + srwi. r6,r5,2 + beq 3f + mtctr r6 + .globl src_error_2 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error + .globl src_error_3 +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b + .globl src_error +src_error: + cmpdi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + + .globl dst_error +dst_error: + cmpdi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .align 3 + .llong 81b,src_error_1 + .llong 91b,dst_error + .llong 82b,src_error_2 + .llong 92b,dst_error + .llong 83b,src_error_3 + .llong 93b,dst_error + .llong 84b,src_error_3 + .llong 94b,dst_error + .llong 95b,dst_error + .llong 96b,dst_error + .llong 97b,dst_error diff --git a/arch/powerpc/lib/copy32.S b/arch/powerpc/lib/copy32.S new file mode 100644 index 00000000000..420a912198a --- /dev/null +++ b/arch/powerpc/lib/copy32.S @@ -0,0 +1,543 @@ +/* + * Memory copy functions for 32-bit PowerPC. + * + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + +#define COPY_16_BYTES \ + lwz r7,4(r4); \ + lwz r8,8(r4); \ + lwz r9,12(r4); \ + lwzu r10,16(r4); \ + stw r7,4(r6); \ + stw r8,8(r6); \ + stw r9,12(r6); \ + stwu r10,16(r6) + +#define COPY_16_BYTES_WITHEX(n) \ +8 ## n ## 0: \ + lwz r7,4(r4); \ +8 ## n ## 1: \ + lwz r8,8(r4); \ +8 ## n ## 2: \ + lwz r9,12(r4); \ +8 ## n ## 3: \ + lwzu r10,16(r4); \ +8 ## n ## 4: \ + stw r7,4(r6); \ +8 ## n ## 5: \ + stw r8,8(r6); \ +8 ## n ## 6: \ + stw r9,12(r6); \ +8 ## n ## 7: \ + stwu r10,16(r6) + +#define COPY_16_BYTES_EXCODE(n) \ +9 ## n ## 0: \ + addi r5,r5,-(16 * n); \ + b 104f; \ +9 ## n ## 1: \ + addi r5,r5,-(16 * n); \ + b 105f; \ +.section __ex_table,"a"; \ + .align 2; \ + .long 8 ## n ## 0b,9 ## n ## 0b; \ + .long 8 ## n ## 1b,9 ## n ## 0b; \ + .long 8 ## n ## 2b,9 ## n ## 0b; \ + .long 8 ## n ## 3b,9 ## n ## 0b; \ + .long 8 ## n ## 4b,9 ## n ## 1b; \ + .long 8 ## n ## 5b,9 ## n ## 1b; \ + .long 8 ## n ## 6b,9 ## n ## 1b; \ + .long 8 ## n ## 7b,9 ## n ## 1b; \ + .text + + .text + .stabs "arch/powerpc/lib/",N_SO,0,0,0f + .stabs "copy32.S",N_SO,0,0,0f +0: + +CACHELINE_BYTES = L1_CACHE_LINE_SIZE +LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE +CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1) + +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. -- paulus + */ +_GLOBAL(cacheable_memzero) + mr r5,r4 + li r4,0 + addi r6,r3,-4 + cmplwi 0,r5,4 + blt 7f + stwu r4,4(r6) + beqlr + andi. r0,r6,3 + add r5,r0,r5 + subf r6,r0,r6 + clrlwi r7,r6,32-LG_CACHELINE_BYTES + add r8,r7,r5 + srwi r9,r8,LG_CACHELINE_BYTES + addic. r9,r9,-1 /* total number of complete cachelines */ + ble 2f + xori r0,r7,CACHELINE_MASK & ~3 + srwi. r0,r0,2 + beq 3f + mtctr r0 +4: stwu r4,4(r6) + bdnz 4b +3: mtctr r9 + li r7,4 +#if !defined(CONFIG_8xx) +10: dcbz r7,r6 +#else +10: stw r4, 4(r6) + stw r4, 8(r6) + stw r4, 12(r6) + stw r4, 16(r6) +#if CACHE_LINE_SIZE >= 32 + stw r4, 20(r6) + stw r4, 24(r6) + stw r4, 28(r6) + stw r4, 32(r6) +#endif /* CACHE_LINE_SIZE */ +#endif + addi r6,r6,CACHELINE_BYTES + bdnz 10b + clrlwi r5,r8,32-LG_CACHELINE_BYTES + addi r5,r5,4 +2: srwi r0,r5,2 + mtctr r0 + bdz 6f +1: stwu r4,4(r6) + bdnz 1b +6: andi. r5,r5,3 +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r6,3 +8: stbu r4,1(r6) + bdnz 8b + blr + +_GLOBAL(memset) + rlwimi r4,r4,8,16,23 + rlwimi r4,r4,16,0,15 + addi r6,r3,-4 + cmplwi 0,r5,4 + blt 7f + stwu r4,4(r6) + beqlr + andi. r0,r6,3 + add r5,r0,r5 + subf r6,r0,r6 + srwi r0,r5,2 + mtctr r0 + bdz 6f +1: stwu r4,4(r6) + bdnz 1b +6: andi. r5,r5,3 +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r6,3 +8: stbu r4,1(r6) + bdnz 8b + blr + +/* + * This version uses dcbz on the complete cache lines in the + * destination area to reduce memory traffic. This requires that + * the destination area is cacheable. + * We only use this version if the source and dest don't overlap. + * -- paulus. + */ +_GLOBAL(cacheable_memcpy) + add r7,r3,r5 /* test if the src & dst overlap */ + add r8,r4,r5 + cmplw 0,r4,r7 + cmplw 1,r3,r8 + crand 0,0,4 /* cr0.lt &= cr1.lt */ + blt memcpy /* if regions overlap */ + + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + subf r5,r0,r5 + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ + stb r9,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 70b +61: srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ + stwu r9,4(r6) + bdnz 72b + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + mtctr r0 + beq 63f +53: +#if !defined(CONFIG_8xx) + dcbz r11,r6 +#endif + COPY_16_BYTES +#if L1_CACHE_LINE_SIZE >= 32 + COPY_16_BYTES +#if L1_CACHE_LINE_SIZE >= 64 + COPY_16_BYTES + COPY_16_BYTES +#if L1_CACHE_LINE_SIZE >= 128 + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES +#endif +#endif +#endif + bdnz 53b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) + stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f +40: lbz r0,4(r4) + stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 40b +65: blr + +_GLOBAL(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + /* fall through */ + +_GLOBAL(memcpy) + srwi. r7,r5,3 + addi r6,r3,-4 + addi r4,r4,-4 + beq 2f /* if less than 8 bytes to do */ + andi. r0,r6,3 /* get dest word aligned */ + mtctr r7 + bne 5f +1: lwz r7,4(r4) + lwzu r8,8(r4) + stw r7,4(r6) + stwu r8,8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,4(r4) + addi r5,r5,-4 + stwu r0,4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r4,r4,3 + addi r6,r6,3 +4: lbzu r0,1(r4) + stbu r0,1(r6) + bdnz 4b + blr +5: subfic r0,r0,4 + mtctr r0 +6: lbz r7,4(r4) + addi r4,r4,1 + stb r7,4(r6) + addi r6,r6,1 + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(__copy_tofrom_user) + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ +71: stb r9,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 70b +61: subf r5,r0,r5 + srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ +73: stwu r9,4(r6) + bdnz 72b + + .section __ex_table,"a" + .align 2 + .long 70b,100f + .long 71b,101f + .long 72b,102f + .long 73b,103f + .text + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + beq 63f + +#ifdef CONFIG_8xx + /* Don't use prefetch on 8xx */ + mtctr r0 + li r0,0 +53: COPY_16_BYTES_WITHEX(0) + bdnz 53b + +#else /* not CONFIG_8xx */ + /* Here we decide how far ahead to prefetch the source */ + li r3,4 + cmpwi r0,1 + li r7,0 + ble 114f + li r7,1 +#if MAX_COPY_PREFETCH > 1 + /* Heuristically, for large transfers we prefetch + MAX_COPY_PREFETCH cachelines ahead. For small transfers + we prefetch 1 cacheline ahead. */ + cmpwi r0,MAX_COPY_PREFETCH + ble 112f + li r7,MAX_COPY_PREFETCH +112: mtctr r7 +111: dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES + bdnz 111b +#else + dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES +#endif /* MAX_COPY_PREFETCH > 1 */ + +114: subf r8,r7,r0 + mr r0,r7 + mtctr r8 + +53: dcbt r3,r4 +54: dcbz r11,r6 + .section __ex_table,"a" + .align 2 + .long 54b,105f + .text +/* the main body of the cacheline loop */ + COPY_16_BYTES_WITHEX(0) +#if L1_CACHE_LINE_SIZE >= 32 + COPY_16_BYTES_WITHEX(1) +#if L1_CACHE_LINE_SIZE >= 64 + COPY_16_BYTES_WITHEX(2) + COPY_16_BYTES_WITHEX(3) +#if L1_CACHE_LINE_SIZE >= 128 + COPY_16_BYTES_WITHEX(4) + COPY_16_BYTES_WITHEX(5) + COPY_16_BYTES_WITHEX(6) + COPY_16_BYTES_WITHEX(7) +#endif +#endif +#endif + bdnz 53b + cmpwi r0,0 + li r3,4 + li r7,0 + bne 114b +#endif /* CONFIG_8xx */ + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) +31: stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f +40: lbz r0,4(r4) +41: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 40b +65: li r3,0 + blr + +/* read fault, initial single-byte copy */ +100: li r9,0 + b 90f +/* write fault, initial single-byte copy */ +101: li r9,1 +90: subf r5,r8,r5 + li r3,0 + b 99f +/* read fault, initial word copy */ +102: li r9,0 + b 91f +/* write fault, initial word copy */ +103: li r9,1 +91: li r3,2 + b 99f + +/* + * this stuff handles faults in the cacheline loop and branches to either + * 104f (if in read part) or 105f (if in write part), after updating r5 + */ + COPY_16_BYTES_EXCODE(0) +#if L1_CACHE_LINE_SIZE >= 32 + COPY_16_BYTES_EXCODE(1) +#if L1_CACHE_LINE_SIZE >= 64 + COPY_16_BYTES_EXCODE(2) + COPY_16_BYTES_EXCODE(3) +#if L1_CACHE_LINE_SIZE >= 128 + COPY_16_BYTES_EXCODE(4) + COPY_16_BYTES_EXCODE(5) + COPY_16_BYTES_EXCODE(6) + COPY_16_BYTES_EXCODE(7) +#endif +#endif +#endif + +/* read fault in cacheline loop */ +104: li r9,0 + b 92f +/* fault on dcbz (effectively a write fault) */ +/* or write fault in cacheline loop */ +105: li r9,1 +92: li r3,LG_CACHELINE_BYTES + mfctr r8 + add r0,r0,r8 + b 106f +/* read fault in final word loop */ +108: li r9,0 + b 93f +/* write fault in final word loop */ +109: li r9,1 +93: andi. r5,r5,3 + li r3,2 + b 99f +/* read fault in final byte loop */ +110: li r9,0 + b 94f +/* write fault in final byte loop */ +111: li r9,1 +94: li r5,0 + li r3,0 +/* + * At this stage the number of bytes not copied is + * r5 + (ctr << r3), and r9 is 0 for read or 1 for write. + */ +99: mfctr r0 +106: slw r3,r0,r3 + add. r3,r3,r5 + beq 120f /* shouldn't happen */ + cmpwi 0,r9,0 + bne 120f +/* for a read fault, first try to continue the copy one byte at a time */ + mtctr r3 +130: lbz r0,4(r4) +131: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 130b +/* then clear out the destination: r3 bytes starting at 4(r6) */ +132: mfctr r3 + srwi. r0,r3,2 + li r9,0 + mtctr r0 + beq 113f +112: stwu r9,4(r6) + bdnz 112b +113: andi. r0,r3,3 + mtctr r0 + beq 120f +114: stb r9,4(r6) + addi r6,r6,1 + bdnz 114b +120: blr + + .section __ex_table,"a" + .align 2 + .long 30b,108b + .long 31b,109b + .long 40b,110b + .long 41b,111b + .long 130b,132b + .long 131b,120b + .long 112b,120b + .long 114b,120b + .text diff --git a/arch/powerpc/lib/copypage.S b/arch/powerpc/lib/copypage.S new file mode 100644 index 00000000000..733d61618bb --- /dev/null +++ b/arch/powerpc/lib/copypage.S @@ -0,0 +1,121 @@ +/* + * arch/ppc64/lib/copypage.S + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copy_page) + std r31,-8(1) + std r30,-16(1) + std r29,-24(1) + std r28,-32(1) + std r27,-40(1) + std r26,-48(1) + std r25,-56(1) + std r24,-64(1) + std r23,-72(1) + std r22,-80(1) + std r21,-88(1) + std r20,-96(1) + li r5,4096/32 - 1 + addi r3,r3,-8 + li r12,5 +0: addi r5,r5,-24 + mtctr r12 + ld r22,640(4) + ld r21,512(4) + ld r20,384(4) + ld r11,256(4) + ld r9,128(4) + ld r7,0(4) + ld r25,648(4) + ld r24,520(4) + ld r23,392(4) + ld r10,264(4) + ld r8,136(4) + ldu r6,8(4) + cmpwi r5,24 +1: std r22,648(3) + std r21,520(3) + std r20,392(3) + std r11,264(3) + std r9,136(3) + std r7,8(3) + ld r28,648(4) + ld r27,520(4) + ld r26,392(4) + ld r31,264(4) + ld r30,136(4) + ld r29,8(4) + std r25,656(3) + std r24,528(3) + std r23,400(3) + std r10,272(3) + std r8,144(3) + std r6,16(3) + ld r22,656(4) + ld r21,528(4) + ld r20,400(4) + ld r11,272(4) + ld r9,144(4) + ld r7,16(4) + std r28,664(3) + std r27,536(3) + std r26,408(3) + std r31,280(3) + std r30,152(3) + stdu r29,24(3) + ld r25,664(4) + ld r24,536(4) + ld r23,408(4) + ld r10,280(4) + ld r8,152(4) + ldu r6,24(4) + bdnz 1b + std r22,648(3) + std r21,520(3) + std r20,392(3) + std r11,264(3) + std r9,136(3) + std r7,8(3) + addi r4,r4,640 + addi r3,r3,648 + bge 0b + mtctr r5 + ld r7,0(4) + ld r8,8(4) + ldu r9,16(4) +3: ld r10,8(4) + std r7,8(3) + ld r7,16(4) + std r8,16(3) + ld r8,24(4) + std r9,24(3) + ldu r9,32(4) + stdu r10,32(3) + bdnz 3b +4: ld r10,8(4) + std r7,8(3) + std r8,16(3) + std r9,24(3) + std r10,32(3) +9: ld r20,-96(1) + ld r21,-88(1) + ld r22,-80(1) + ld r23,-72(1) + ld r24,-64(1) + ld r25,-56(1) + ld r26,-48(1) + ld r27,-40(1) + ld r28,-32(1) + ld r29,-24(1) + ld r30,-16(1) + ld r31,-8(1) + blr diff --git a/arch/powerpc/lib/copyuser.S b/arch/powerpc/lib/copyuser.S new file mode 100644 index 00000000000..a0b3fbbd6fb --- /dev/null +++ b/arch/powerpc/lib/copyuser.S @@ -0,0 +1,576 @@ +/* + * arch/ppc64/lib/copyuser.S + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + + .align 7 +_GLOBAL(__copy_tofrom_user) + /* first check for a whole page copy on a page boundary */ + cmpldi cr1,r5,16 + cmpdi cr6,r5,4096 + or r0,r3,r4 + neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + andi. r0,r0,4095 + std r3,-24(r1) + crand cr0*4+2,cr0*4+2,cr6*4+2 + std r4,-16(r1) + std r5,-8(r1) + dcbt 0,r4 + beq .Lcopy_page + andi. r6,r6,7 + mtcrf 0x01,r5 + blt cr1,.Lshort_co |