aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2005-09-26 16:04:21 +1000
committerPaul Mackerras <paulus@samba.org>2005-09-26 16:04:21 +1000
commit14cf11af6cf608eb8c23e989ddb17a715ddce109 (patch)
tree271a97ce73e265f39c569cb159c195c5b4bb3f8c /arch/powerpc/lib
parente5baa396af7560382d2cf3f0871d616b61fc284c (diff)
powerpc: Merge enough to start building in arch/powerpc.
This creates the directory structure under arch/powerpc and a bunch of Kconfig files. It does a first-cut merge of arch/powerpc/mm, arch/powerpc/lib and arch/powerpc/platforms/powermac. This is enough to build a 32-bit powermac kernel with ARCH=powerpc. For now we are getting some unmerged files from arch/ppc/kernel and arch/ppc/syslib, or arch/ppc64/kernel. This makes some minor changes to files in those directories and files outside arch/powerpc. The boot directory is still not merged. That's going to be interesting. Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--arch/powerpc/lib/Makefile9
-rw-r--r--arch/powerpc/lib/checksum.S225
-rw-r--r--arch/powerpc/lib/checksum64.S229
-rw-r--r--arch/powerpc/lib/copy32.S543
-rw-r--r--arch/powerpc/lib/copypage.S121
-rw-r--r--arch/powerpc/lib/copyuser.S576
-rw-r--r--arch/powerpc/lib/div64.S58
-rw-r--r--arch/powerpc/lib/e2a.c108
-rw-r--r--arch/powerpc/lib/memcpy.S172
-rw-r--r--arch/powerpc/lib/rheap.c693
-rw-r--r--arch/powerpc/lib/sstep.c141
-rw-r--r--arch/powerpc/lib/strcase.c23
-rw-r--r--arch/powerpc/lib/string.S203
-rw-r--r--arch/powerpc/lib/usercopy.c41
14 files changed, 3142 insertions, 0 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
new file mode 100644
index 00000000000..347f9798e43
--- /dev/null
+++ b/arch/powerpc/lib/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for ppc-specific library files..
+#
+
+obj-y := strcase.o string.o
+obj-$(CONFIG_PPC32) += div64.o copy32.o checksum.o
+obj-$(CONFIG_PPC64) += copypage.o copyuser.o memcpy.o usercopy.o \
+ sstep.o checksum64.o
+obj-$(CONFIG_PPC_ISERIES) += e2a.o
diff --git a/arch/powerpc/lib/checksum.S b/arch/powerpc/lib/checksum.S
new file mode 100644
index 00000000000..7874e8a8045
--- /dev/null
+++ b/arch/powerpc/lib/checksum.S
@@ -0,0 +1,225 @@
+/*
+ * This file contains assembly-language implementations
+ * of IP-style 1's complement checksum routines.
+ *
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
+ */
+
+#include <linux/sys.h>
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+
+ .text
+
+/*
+ * ip_fast_csum(buf, len) -- Optimized for IP header
+ * len is in words and is always >= 5.
+ */
+_GLOBAL(ip_fast_csum)
+ lwz r0,0(r3)
+ lwzu r5,4(r3)
+ addic. r4,r4,-2
+ addc r0,r0,r5
+ mtctr r4
+ blelr-
+1: lwzu r4,4(r3)
+ adde r0,r0,r4
+ bdnz 1b
+ addze r0,r0 /* add in final carry */
+ rlwinm r3,r0,16,0,31 /* fold two halves together */
+ add r3,r0,r3
+ not r3,r3
+ srwi r3,r3,16
+ blr
+
+/*
+ * Compute checksum of TCP or UDP pseudo-header:
+ * csum_tcpudp_magic(saddr, daddr, len, proto, sum)
+ */
+_GLOBAL(csum_tcpudp_magic)
+ rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
+ addc r0,r3,r4 /* add 4 32-bit words together */
+ adde r0,r0,r5
+ adde r0,r0,r7
+ addze r0,r0 /* add in final carry */
+ rlwinm r3,r0,16,0,31 /* fold two halves together */
+ add r3,r0,r3
+ not r3,r3
+ srwi r3,r3,16
+ blr
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * csum_partial(buff, len, sum)
+ */
+_GLOBAL(csum_partial)
+ addic r0,r5,0
+ subi r3,r3,4
+ srwi. r6,r4,2
+ beq 3f /* if we're doing < 4 bytes */
+ andi. r5,r3,2 /* Align buffer to longword boundary */
+ beq+ 1f
+ lhz r5,4(r3) /* do 2 bytes to get aligned */
+ addi r3,r3,2
+ subi r4,r4,2
+ addc r0,r0,r5
+ srwi. r6,r4,2 /* # words to do */
+ beq 3f
+1: mtctr r6
+2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */
+ adde r0,r0,r5 /* be unnecessary to unroll this loop */
+ bdnz 2b
+ andi. r4,r4,3
+3: cmpwi 0,r4,2
+ blt+ 4f
+ lhz r5,4(r3)
+ addi r3,r3,2
+ subi r4,r4,2
+ adde r0,r0,r5
+4: cmpwi 0,r4,1
+ bne+ 5f
+ lbz r5,4(r3)
+ slwi r5,r5,8 /* Upper byte of word */
+ adde r0,r0,r5
+5: addze r3,r0 /* add in final carry */
+ blr
+
+/*
+ * Computes the checksum of a memory block at src, length len,
+ * and adds in "sum" (32-bit), while copying the block to dst.
+ * If an access exception occurs on src or dst, it stores -EFAULT
+ * to *src_err or *dst_err respectively, and (for an error on
+ * src) zeroes the rest of dst.
+ *
+ * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
+ */
+_GLOBAL(csum_partial_copy_generic)
+ addic r0,r6,0
+ subi r3,r3,4
+ subi r4,r4,4
+ srwi. r6,r5,2
+ beq 3f /* if we're doing < 4 bytes */
+ andi. r9,r4,2 /* Align dst to longword boundary */
+ beq+ 1f
+81: lhz r6,4(r3) /* do 2 bytes to get aligned */
+ addi r3,r3,2
+ subi r5,r5,2
+91: sth r6,4(r4)
+ addi r4,r4,2
+ addc r0,r0,r6
+ srwi. r6,r5,2 /* # words to do */
+ beq 3f
+1: srwi. r6,r5,4 /* # groups of 4 words to do */
+ beq 10f
+ mtctr r6
+71: lwz r6,4(r3)
+72: lwz r9,8(r3)
+73: lwz r10,12(r3)
+74: lwzu r11,16(r3)
+ adde r0,r0,r6
+75: stw r6,4(r4)
+ adde r0,r0,r9
+76: stw r9,8(r4)
+ adde r0,r0,r10
+77: stw r10,12(r4)
+ adde r0,r0,r11
+78: stwu r11,16(r4)
+ bdnz 71b
+10: rlwinm. r6,r5,30,30,31 /* # words left to do */
+ beq 13f
+ mtctr r6
+82: lwzu r9,4(r3)
+92: stwu r9,4(r4)
+ adde r0,r0,r9
+ bdnz 82b
+13: andi. r5,r5,3
+3: cmpwi 0,r5,2
+ blt+ 4f
+83: lhz r6,4(r3)
+ addi r3,r3,2
+ subi r5,r5,2
+93: sth r6,4(r4)
+ addi r4,r4,2
+ adde r0,r0,r6
+4: cmpwi 0,r5,1
+ bne+ 5f
+84: lbz r6,4(r3)
+94: stb r6,4(r4)
+ slwi r6,r6,8 /* Upper byte of word */
+ adde r0,r0,r6
+5: addze r3,r0 /* add in final carry */
+ blr
+
+/* These shouldn't go in the fixup section, since that would
+ cause the ex_table addresses to get out of order. */
+
+src_error_4:
+ mfctr r6 /* update # bytes remaining from ctr */
+ rlwimi r5,r6,4,0,27
+ b 79f
+src_error_1:
+ li r6,0
+ subi r5,r5,2
+95: sth r6,4(r4)
+ addi r4,r4,2
+79: srwi. r6,r5,2
+ beq 3f
+ mtctr r6
+src_error_2:
+ li r6,0
+96: stwu r6,4(r4)
+ bdnz 96b
+3: andi. r5,r5,3
+ beq src_error
+src_error_3:
+ li r6,0
+ mtctr r5
+ addi r4,r4,3
+97: stbu r6,1(r4)
+ bdnz 97b
+src_error:
+ cmpwi 0,r7,0
+ beq 1f
+ li r6,-EFAULT
+ stw r6,0(r7)
+1: addze r3,r0
+ blr
+
+dst_error:
+ cmpwi 0,r8,0
+ beq 1f
+ li r6,-EFAULT
+ stw r6,0(r8)
+1: addze r3,r0
+ blr
+
+.section __ex_table,"a"
+ .long 81b,src_error_1
+ .long 91b,dst_error
+ .long 71b,src_error_4
+ .long 72b,src_error_4
+ .long 73b,src_error_4
+ .long 74b,src_error_4
+ .long 75b,dst_error
+ .long 76b,dst_error
+ .long 77b,dst_error
+ .long 78b,dst_error
+ .long 82b,src_error_2
+ .long 92b,dst_error
+ .long 83b,src_error_3
+ .long 93b,dst_error
+ .long 84b,src_error_3
+ .long 94b,dst_error
+ .long 95b,dst_error
+ .long 96b,dst_error
+ .long 97b,dst_error
diff --git a/arch/powerpc/lib/checksum64.S b/arch/powerpc/lib/checksum64.S
new file mode 100644
index 00000000000..ef96c6c58ef
--- /dev/null
+++ b/arch/powerpc/lib/checksum64.S
@@ -0,0 +1,229 @@
+/*
+ * This file contains assembly-language implementations
+ * of IP-style 1's complement checksum routines.
+ *
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
+ */
+
+#include <linux/sys.h>
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+
+/*
+ * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
+ * len is in words and is always >= 5.
+ *
+ * In practice len == 5, but this is not guaranteed. So this code does not
+ * attempt to use doubleword instructions.
+ */
+_GLOBAL(ip_fast_csum)
+ lwz r0,0(r3)
+ lwzu r5,4(r3)
+ addic. r4,r4,-2
+ addc r0,r0,r5
+ mtctr r4
+ blelr-
+1: lwzu r4,4(r3)
+ adde r0,r0,r4
+ bdnz 1b
+ addze r0,r0 /* add in final carry */
+ rldicl r4,r0,32,0 /* fold two 32-bit halves together */
+ add r0,r0,r4
+ srdi r0,r0,32
+ rlwinm r3,r0,16,0,31 /* fold two halves together */
+ add r3,r0,r3
+ not r3,r3
+ srwi r3,r3,16
+ blr
+
+/*
+ * Compute checksum of TCP or UDP pseudo-header:
+ * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
+ * No real gain trying to do this specially for 64 bit, but
+ * the 32 bit addition may spill into the upper bits of
+ * the doubleword so we still must fold it down from 64.
+ */
+_GLOBAL(csum_tcpudp_magic)
+ rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
+ addc r0,r3,r4 /* add 4 32-bit words together */
+ adde r0,r0,r5
+ adde r0,r0,r7
+ rldicl r4,r0,32,0 /* fold 64 bit value */
+ add r0,r4,r0
+ srdi r0,r0,32
+ rlwinm r3,r0,16,0,31 /* fold two halves together */
+ add r3,r0,r3
+ not r3,r3
+ srwi r3,r3,16
+ blr
+
+/*
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit).
+ *
+ * This code assumes at least halfword alignment, though the length
+ * can be any number of bytes. The sum is accumulated in r5.
+ *
+ * csum_partial(r3=buff, r4=len, r5=sum)
+ */
+_GLOBAL(csum_partial)
+ subi r3,r3,8 /* we'll offset by 8 for the loads */
+ srdi. r6,r4,3 /* divide by 8 for doubleword count */
+ addic r5,r5,0 /* clear carry */
+ beq 3f /* if we're doing < 8 bytes */
+ andi. r0,r3,2 /* aligned on a word boundary already? */
+ beq+ 1f
+ lhz r6,8(r3) /* do 2 bytes to get aligned */
+ addi r3,r3,2
+ subi r4,r4,2
+ addc r5,r5,r6
+ srdi. r6,r4,3 /* recompute number of doublewords */
+ beq 3f /* any left? */
+1: mtctr r6
+2: ldu r6,8(r3) /* main sum loop */
+ adde r5,r5,r6
+ bdnz 2b
+ andi. r4,r4,7 /* compute bytes left to sum after doublewords */
+3: cmpwi 0,r4,4 /* is at least a full word left? */
+ blt 4f
+ lwz r6,8(r3) /* sum this word */
+ addi r3,r3,4
+ subi r4,r4,4
+ adde r5,r5,r6
+4: cmpwi 0,r4,2 /* is at least a halfword left? */
+ blt+ 5f
+ lhz r6,8(r3) /* sum this halfword */
+ addi r3,r3,2
+ subi r4,r4,2
+ adde r5,r5,r6
+5: cmpwi 0,r4,1 /* is at least a byte left? */
+ bne+ 6f
+ lbz r6,8(r3) /* sum this byte */
+ slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */
+ adde r5,r5,r6
+6: addze r5,r5 /* add in final carry */
+ rldicl r4,r5,32,0 /* fold two 32-bit halves together */
+ add r3,r4,r5
+ srdi r3,r3,32
+ blr
+
+/*
+ * Computes the checksum of a memory block at src, length len,
+ * and adds in "sum" (32-bit), while copying the block to dst.
+ * If an access exception occurs on src or dst, it stores -EFAULT
+ * to *src_err or *dst_err respectively, and (for an error on
+ * src) zeroes the rest of dst.
+ *
+ * This code needs to be reworked to take advantage of 64 bit sum+copy.
+ * However, due to tokenring halfword alignment problems this will be very
+ * tricky. For now we'll leave it until we instrument it somehow.
+ *
+ * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
+ */
+_GLOBAL(csum_partial_copy_generic)
+ addic r0,r6,0
+ subi r3,r3,4
+ subi r4,r4,4
+ srwi. r6,r5,2
+ beq 3f /* if we're doing < 4 bytes */
+ andi. r9,r4,2 /* Align dst to longword boundary */
+ beq+ 1f
+81: lhz r6,4(r3) /* do 2 bytes to get aligned */
+ addi r3,r3,2
+ subi r5,r5,2
+91: sth r6,4(r4)
+ addi r4,r4,2
+ addc r0,r0,r6
+ srwi. r6,r5,2 /* # words to do */
+ beq 3f
+1: mtctr r6
+82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */
+92: stwu r6,4(r4) /* be unnecessary to unroll this loop */
+ adde r0,r0,r6
+ bdnz 82b
+ andi. r5,r5,3
+3: cmpwi 0,r5,2
+ blt+ 4f
+83: lhz r6,4(r3)
+ addi r3,r3,2
+ subi r5,r5,2
+93: sth r6,4(r4)
+ addi r4,r4,2
+ adde r0,r0,r6
+4: cmpwi 0,r5,1
+ bne+ 5f
+84: lbz r6,4(r3)
+94: stb r6,4(r4)
+ slwi r6,r6,8 /* Upper byte of word */
+ adde r0,r0,r6
+5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */
+ rldicl r4,r3,32,0 /* fold 64 bit value */
+ add r3,r4,r3
+ srdi r3,r3,32
+ blr
+
+/* These shouldn't go in the fixup section, since that would
+ cause the ex_table addresses to get out of order. */
+
+ .globl src_error_1
+src_error_1:
+ li r6,0
+ subi r5,r5,2
+95: sth r6,4(r4)
+ addi r4,r4,2
+ srwi. r6,r5,2
+ beq 3f
+ mtctr r6
+ .globl src_error_2
+src_error_2:
+ li r6,0
+96: stwu r6,4(r4)
+ bdnz 96b
+3: andi. r5,r5,3
+ beq src_error
+ .globl src_error_3
+src_error_3:
+ li r6,0
+ mtctr r5
+ addi r4,r4,3
+97: stbu r6,1(r4)
+ bdnz 97b
+ .globl src_error
+src_error:
+ cmpdi 0,r7,0
+ beq 1f
+ li r6,-EFAULT
+ stw r6,0(r7)
+1: addze r3,r0
+ blr
+
+ .globl dst_error
+dst_error:
+ cmpdi 0,r8,0
+ beq 1f
+ li r6,-EFAULT
+ stw r6,0(r8)
+1: addze r3,r0
+ blr
+
+.section __ex_table,"a"
+ .align 3
+ .llong 81b,src_error_1
+ .llong 91b,dst_error
+ .llong 82b,src_error_2
+ .llong 92b,dst_error
+ .llong 83b,src_error_3
+ .llong 93b,dst_error
+ .llong 84b,src_error_3
+ .llong 94b,dst_error
+ .llong 95b,dst_error
+ .llong 96b,dst_error
+ .llong 97b,dst_error
diff --git a/arch/powerpc/lib/copy32.S b/arch/powerpc/lib/copy32.S
new file mode 100644
index 00000000000..420a912198a
--- /dev/null
+++ b/arch/powerpc/lib/copy32.S
@@ -0,0 +1,543 @@
+/*
+ * Memory copy functions for 32-bit PowerPC.
+ *
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+
+#define COPY_16_BYTES \
+ lwz r7,4(r4); \
+ lwz r8,8(r4); \
+ lwz r9,12(r4); \
+ lwzu r10,16(r4); \
+ stw r7,4(r6); \
+ stw r8,8(r6); \
+ stw r9,12(r6); \
+ stwu r10,16(r6)
+
+#define COPY_16_BYTES_WITHEX(n) \
+8 ## n ## 0: \
+ lwz r7,4(r4); \
+8 ## n ## 1: \
+ lwz r8,8(r4); \
+8 ## n ## 2: \
+ lwz r9,12(r4); \
+8 ## n ## 3: \
+ lwzu r10,16(r4); \
+8 ## n ## 4: \
+ stw r7,4(r6); \
+8 ## n ## 5: \
+ stw r8,8(r6); \
+8 ## n ## 6: \
+ stw r9,12(r6); \
+8 ## n ## 7: \
+ stwu r10,16(r6)
+
+#define COPY_16_BYTES_EXCODE(n) \
+9 ## n ## 0: \
+ addi r5,r5,-(16 * n); \
+ b 104f; \
+9 ## n ## 1: \
+ addi r5,r5,-(16 * n); \
+ b 105f; \
+.section __ex_table,"a"; \
+ .align 2; \
+ .long 8 ## n ## 0b,9 ## n ## 0b; \
+ .long 8 ## n ## 1b,9 ## n ## 0b; \
+ .long 8 ## n ## 2b,9 ## n ## 0b; \
+ .long 8 ## n ## 3b,9 ## n ## 0b; \
+ .long 8 ## n ## 4b,9 ## n ## 1b; \
+ .long 8 ## n ## 5b,9 ## n ## 1b; \
+ .long 8 ## n ## 6b,9 ## n ## 1b; \
+ .long 8 ## n ## 7b,9 ## n ## 1b; \
+ .text
+
+ .text
+ .stabs "arch/powerpc/lib/",N_SO,0,0,0f
+ .stabs "copy32.S",N_SO,0,0,0f
+0:
+
+CACHELINE_BYTES = L1_CACHE_LINE_SIZE
+LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE
+CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1)
+
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero. This requires that the destination
+ * area is cacheable. -- paulus
+ */
+_GLOBAL(cacheable_memzero)
+ mr r5,r4
+ li r4,0
+ addi r6,r3,-4
+ cmplwi 0,r5,4
+ blt 7f
+ stwu r4,4(r6)
+ beqlr
+ andi. r0,r6,3
+ add r5,r0,r5
+ subf r6,r0,r6
+ clrlwi r7,r6,32-LG_CACHELINE_BYTES
+ add r8,r7,r5
+ srwi r9,r8,LG_CACHELINE_BYTES
+ addic. r9,r9,-1 /* total number of complete cachelines */
+ ble 2f
+ xori r0,r7,CACHELINE_MASK & ~3
+ srwi. r0,r0,2
+ beq 3f
+ mtctr r0
+4: stwu r4,4(r6)
+ bdnz 4b
+3: mtctr r9
+ li r7,4
+#if !defined(CONFIG_8xx)
+10: dcbz r7,r6
+#else
+10: stw r4, 4(r6)
+ stw r4, 8(r6)
+ stw r4, 12(r6)
+ stw r4, 16(r6)
+#if CACHE_LINE_SIZE >= 32
+ stw r4, 20(r6)
+ stw r4, 24(r6)
+ stw r4, 28(r6)
+ stw r4, 32(r6)
+#endif /* CACHE_LINE_SIZE */
+#endif
+ addi r6,r6,CACHELINE_BYTES
+ bdnz 10b
+ clrlwi r5,r8,32-LG_CACHELINE_BYTES
+ addi r5,r5,4
+2: srwi r0,r5,2
+ mtctr r0
+ bdz 6f
+1: stwu r4,4(r6)
+ bdnz 1b
+6: andi. r5,r5,3
+7: cmpwi 0,r5,0
+ beqlr
+ mtctr r5
+ addi r6,r6,3
+8: stbu r4,1(r6)
+ bdnz 8b
+ blr
+
+_GLOBAL(memset)
+ rlwimi r4,r4,8,16,23
+ rlwimi r4,r4,16,0,15
+ addi r6,r3,-4
+ cmplwi 0,r5,4
+ blt 7f
+ stwu r4,4(r6)
+ beqlr
+ andi. r0,r6,3
+ add r5,r0,r5
+ subf r6,r0,r6
+ srwi r0,r5,2
+ mtctr r0
+ bdz 6f
+1: stwu r4,4(r6)
+ bdnz 1b
+6: andi. r5,r5,3
+7: cmpwi 0,r5,0
+ beqlr
+ mtctr r5
+ addi r6,r6,3
+8: stbu r4,1(r6)
+ bdnz 8b
+ blr
+
+/*
+ * This version uses dcbz on the complete cache lines in the
+ * destination area to reduce memory traffic. This requires that
+ * the destination area is cacheable.
+ * We only use this version if the source and dest don't overlap.
+ * -- paulus.
+ */
+_GLOBAL(cacheable_memcpy)
+ add r7,r3,r5 /* test if the src & dst overlap */
+ add r8,r4,r5
+ cmplw 0,r4,r7
+ cmplw 1,r3,r8
+ crand 0,0,4 /* cr0.lt &= cr1.lt */
+ blt memcpy /* if regions overlap */
+
+ addi r4,r4,-4
+ addi r6,r3,-4
+ neg r0,r3
+ andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
+ beq 58f
+
+ cmplw 0,r5,r0 /* is this more than total to do? */
+ blt 63f /* if not much to do */
+ andi. r8,r0,3 /* get it word-aligned first */
+ subf r5,r0,r5
+ mtctr r8
+ beq+ 61f
+70: lbz r9,4(r4) /* do some bytes */
+ stb r9,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 70b
+61: srwi. r0,r0,2
+ mtctr r0
+ beq 58f
+72: lwzu r9,4(r4) /* do some words */
+ stwu r9,4(r6)
+ bdnz 72b
+
+58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+ clrlwi r5,r5,32-LG_CACHELINE_BYTES
+ li r11,4
+ mtctr r0
+ beq 63f
+53:
+#if !defined(CONFIG_8xx)
+ dcbz r11,r6
+#endif
+ COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 32
+ COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 64
+ COPY_16_BYTES
+ COPY_16_BYTES
+#if L1_CACHE_LINE_SIZE >= 128
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+#endif
+#endif
+#endif
+ bdnz 53b
+
+63: srwi. r0,r5,2
+ mtctr r0
+ beq 64f
+30: lwzu r0,4(r4)
+ stwu r0,4(r6)
+ bdnz 30b
+
+64: andi. r0,r5,3
+ mtctr r0
+ beq+ 65f
+40: lbz r0,4(r4)
+ stb r0,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 40b
+65: blr
+
+_GLOBAL(memmove)
+ cmplw 0,r3,r4
+ bgt backwards_memcpy
+ /* fall through */
+
+_GLOBAL(memcpy)
+ srwi. r7,r5,3
+ addi r6,r3,-4
+ addi r4,r4,-4
+ beq 2f /* if less than 8 bytes to do */
+ andi. r0,r6,3 /* get dest word aligned */
+ mtctr r7
+ bne 5f
+1: lwz r7,4(r4)
+ lwzu r8,8(r4)
+ stw r7,4(r6)
+ stwu r8,8(r6)
+ bdnz 1b
+ andi. r5,r5,7
+2: cmplwi 0,r5,4
+ blt 3f
+ lwzu r0,4(r4)
+ addi r5,r5,-4
+ stwu r0,4(r6)
+3: cmpwi 0,r5,0
+ beqlr
+ mtctr r5
+ addi r4,r4,3
+ addi r6,r6,3
+4: lbzu r0,1(r4)
+ stbu r0,1(r6)
+ bdnz 4b
+ blr
+5: subfic r0,r0,4
+ mtctr r0
+6: lbz r7,4(r4)
+ addi r4,r4,1
+ stb r7,4(r6)
+ addi r6,r6,1
+ bdnz 6b
+ subf r5,r0,r5
+ rlwinm. r7,r5,32-3,3,31
+ beq 2b
+ mtctr r7
+ b 1b
+
+_GLOBAL(backwards_memcpy)
+ rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */
+ add r6,r3,r5
+ add r4,r4,r5
+ beq 2f
+ andi. r0,r6,3
+ mtctr r7
+ bne 5f
+1: lwz r7,-4(r4)
+ lwzu r8,-8(r4)
+ stw r7,-4(r6)
+ stwu r8,-8(r6)
+ bdnz 1b
+ andi. r5,r5,7
+2: cmplwi 0,r5,4
+ blt 3f
+ lwzu r0,-4(r4)
+ subi r5,r5,4
+ stwu r0,-4(r6)
+3: cmpwi 0,r5,0
+ beqlr
+ mtctr r5
+4: lbzu r0,-1(r4)
+ stbu r0,-1(r6)
+ bdnz 4b
+ blr
+5: mtctr r0
+6: lbzu r7,-1(r4)
+ stbu r7,-1(r6)
+ bdnz 6b
+ subf r5,r0,r5
+ rlwinm. r7,r5,32-3,3,31
+ beq 2b
+ mtctr r7
+ b 1b
+
+_GLOBAL(__copy_tofrom_user)
+ addi r4,r4,-4
+ addi r6,r3,-4
+ neg r0,r3
+ andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
+ beq 58f
+
+ cmplw 0,r5,r0 /* is this more than total to do? */
+ blt 63f /* if not much to do */
+ andi. r8,r0,3 /* get it word-aligned first */
+ mtctr r8
+ beq+ 61f
+70: lbz r9,4(r4) /* do some bytes */
+71: stb r9,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 70b
+61: subf r5,r0,r5
+ srwi. r0,r0,2
+ mtctr r0
+ beq 58f
+72: lwzu r9,4(r4) /* do some words */
+73: stwu r9,4(r6)
+ bdnz 72b
+
+ .section __ex_table,"a"
+ .align 2
+ .long 70b,100f
+ .long 71b,101f
+ .long 72b,102f
+ .long 73b,103f
+ .text
+
+58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+ clrlwi r5,r5,32-LG_CACHELINE_BYTES
+ li r11,4
+ beq 63f
+
+#ifdef CONFIG_8xx
+ /* Don't use prefetch on 8xx */
+ mtctr r0
+ li r0,0
+53: COPY_16_BYTES_WITHEX(0)
+ bdnz 53b
+
+#else /* not CONFIG_8xx */
+ /* Here we decide how far ahead to prefetch the source */
+ li r3,4
+ cmpwi r0,1
+ li r7,0
+ ble 114f
+ li r7,1
+#if MAX_COPY_PREFETCH > 1
+ /* Heuristically, for large transfers we prefetch
+ MAX_COPY_PREFETCH cachelines ahead. For small transfers
+ we prefetch 1 cacheline ahead. */
+ cmpwi r0,MAX_COPY_PREFETCH
+ ble 112f
+ li r7,MAX_COPY_PREFETCH
+112: mtctr r7
+111: dcbt r3,r4
+ addi r3,r3,CACHELINE_BYTES
+ bdnz 111b
+#else
+ dcbt r3,r4
+ addi r3,r3,CACHELINE_BYTES
+#endif /* MAX_COPY_PREFETCH > 1 */
+
+114: subf r8,r7,r0
+ mr r0,r7
+ mtctr r8
+
+53: dcbt r3,r4
+54: dcbz r11,r6
+ .section __ex_table,"a"
+ .align 2
+ .long 54b,105f
+ .text
+/* the main body of the cacheline loop */
+ COPY_16_BYTES_WITHEX(0)
+#if L1_CACHE_LINE_SIZE >= 32
+ COPY_16_BYTES_WITHEX(1)
+#if L1_CACHE_LINE_SIZE >= 64
+ COPY_16_BYTES_WITHEX(2)
+ COPY_16_BYTES_WITHEX(3)
+#if L1_CACHE_LINE_SIZE >= 128
+ COPY_16_BYTES_WITHEX(4)
+ COPY_16_BYTES_WITHEX(5)
+ COPY_16_BYTES_WITHEX(6)
+ COPY_16_BYTES_WITHEX(7)
+#endif
+#endif
+#endif
+ bdnz 53b
+ cmpwi r0,0
+ li r3,4
+ li r7,0
+ bne 114b
+#endif /* CONFIG_8xx */
+
+63: srwi. r0,r5,2
+ mtctr r0
+ beq 64f
+30: lwzu r0,4(r4)
+31: stwu r0,4(r6)
+ bdnz 30b
+
+64: andi. r0,r5,3
+ mtctr r0
+ beq+ 65f
+40: lbz r0,4(r4)
+41: stb r0,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 40b
+65: li r3,0
+ blr
+
+/* read fault, initial single-byte copy */
+100: li r9,0
+ b 90f
+/* write fault, initial single-byte copy */
+101: li r9,1
+90: subf r5,r8,r5
+ li r3,0
+ b 99f
+/* read fault, initial word copy */
+102: li r9,0
+ b 91f
+/* write fault, initial word copy */
+103: li r9,1
+91: li r3,2
+ b 99f
+
+/*
+ * this stuff handles faults in the cacheline loop and branches to either
+ * 104f (if in read part) or 105f (if in write part), after updating r5
+ */
+ COPY_16_BYTES_EXCODE(0)
+#if L1_CACHE_LINE_SIZE >= 32
+ COPY_16_BYTES_EXCODE(1)
+#if L1_CACHE_LINE_SIZE >= 64
+ COPY_16_BYTES_EXCODE(2)
+ COPY_16_BYTES_EXCODE(3)
+#if L1_CACHE_LINE_SIZE >= 128
+ COPY_16_BYTES_EXCODE(4)
+ COPY_16_BYTES_EXCODE(5)
+ COPY_16_BYTES_EXCODE(6)
+ COPY_16_BYTES_EXCODE(7)
+#endif
+#endif
+#endif
+
+/* read fault in cacheline loop */
+104: li r9,0
+ b 92f
+/* fault on dcbz (effectively a write fault) */
+/* or write fault in cacheline loop */
+105: li r9,1
+92: li r3,LG_CACHELINE_BYTES
+ mfctr r8
+ add r0,r0,r8
+ b 106f
+/* read fault in final word loop */
+108: li r9,0
+ b 93f
+/* write fault in final word loop */
+109: li r9,1
+93: andi. r5,r5,3
+ li r3,2
+ b 99f
+/* read fault in final byte loop */
+110: li r9,0
+ b 94f
+/* write fault in final byte loop */
+111: li r9,1
+94: li r5,0
+ li r3,0
+/*
+ * At this stage the number of bytes not copied is
+ * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
+ */
+99: mfctr r0
+106: slw r3,r0,r3
+ add. r3,r3,r5
+ beq 120f /* shouldn't happen */
+ cmpwi 0,r9,0
+ bne 120f
+/* for a read fault, first try to continue the copy one byte at a time */
+ mtctr r3
+130: lbz r0,4(r4)
+131: stb r0,4(r6)
+ addi r4,r4,1
+ addi r6,r6,1
+ bdnz 130b
+/* then clear out the destination: r3 bytes starting at 4(r6) */
+132: mfctr r3
+ srwi. r0,r3,2
+ li r9,0
+ mtctr r0
+ beq 113f
+112: stwu r9,4(r6)
+ bdnz 112b
+113: andi. r0,r3,3
+ mtctr r0
+ beq 120f
+114: stb r9,4(r6)
+ addi r6,r6,1
+ bdnz 114b
+120: blr
+
+ .section __ex_table,"a"
+ .align 2
+ .long 30b,108b
+ .long 31b,109b
+ .long 40b,110b
+ .long 41b,111b
+ .long 130b,132b
+ .long 131b,120b
+ .long 112b,120b
+ .long 114b,120b
+ .text
diff --git a/arch/powerpc/lib/copypage.S b/arch/powerpc/lib/copypage.S
new file mode 100644
index 00000000000..733d61618bb
--- /dev/null
+++ b/arch/powerpc/lib/copypage.S
@@ -0,0 +1,121 @@
+/*
+ * arch/ppc64/lib/copypage.S
+ *
+ * Copyright (C) 2002 Paul Mackerras, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copy_page)
+ std r31,-8(1)
+ std r30,-16(1)
+ std r29,-24(1)
+ std r28,-32(1)
+ std r27,-40(1)
+ std r26,-48(1)
+ std r25,-56(1)
+ std r24,-64(1)
+ std r23,-72(1)
+ std r22,-80(1)
+ std r21,-88(1)
+ std r20,-96(1)
+ li r5,4096/32 - 1
+ addi r3,r3,-8
+ li r12,5
+0: addi r5,r5,-24
+ mtctr r12
+ ld r22,640(4)
+ ld r21,512(4)
+ ld r20,384(4)
+ ld r11,256(4)
+ ld r9,128(4)
+ ld r7,0(4)
+ ld r25,648(4)
+ ld r24,520(4)
+ ld r23,392(4)
+ ld r10,264(4)
+ ld r8,136(4)
+ ldu r6,8(4)
+ cmpwi r5,24
+1: std r22,648(3)
+ std r21,520(3)
+ std r20,392(3)
+ std r11,264(3)
+ std r9,136(3)
+ std r7,8(3)
+ ld r28,648(4)
+ ld r27,520(4)
+ ld r26,392(4)
+ ld r31,264(4)
+ ld r30,136(4)
+ ld r29,8(4)
+ std r25,656(3)
+ std r24,528(3)
+ std r23,400(3)
+ std r10,272(3)
+ std r8,144(3)
+ std r6,16(3)
+ ld r22,656(4)
+ ld r21,528(4)
+ ld r20,400(4)
+ ld r11,272(4)
+ ld r9,144(4)
+ ld r7,16(4)
+ std r28,664(3)
+ std r27,536(3)
+ std r26,408(3)
+ std r31,280(3)
+ std r30,152(3)
+ stdu r29,24(3)
+ ld r25,664(4)
+ ld r24,536(4)
+ ld r23,408(4)
+ ld r10,280(4)
+ ld r8,152(4)
+ ldu r6,24(4)
+ bdnz 1b
+ std r22,648(3)
+ std r21,520(3)
+ std r20,392(3)
+ std r11,264(3)
+ std r9,136(3)
+ std r7,8(3)
+ addi r4,r4,640
+ addi r3,r3,648
+ bge 0b
+ mtctr r5
+ ld r7,0(4)
+ ld r8,8(4)
+ ldu r9,16(4)
+3: ld r10,8(4)
+ std r7,8(3)
+ ld r7,16(4)
+ std r8,16(3)
+ ld r8,24(4)
+ std r9,24(3)
+ ldu r9,32(4)
+ stdu r10,32(3)
+ bdnz 3b
+4: ld r10,8(4)
+ std r7,8(3)
+ std r8,16(3)
+ std r9,24(3)
+ std r10,32(3)
+9: ld r20,-96(1)
+ ld r21,-88(1)
+ ld r22,-80(1)
+ ld r23,-72(1)
+ ld r24,-64(1)
+ ld r25,-56(1)
+ ld r26,-48(1)
+ ld r27,-40(1)
+ ld r28,-32(1)
+ ld r29,-24(1)
+ ld r30,-16(1)
+ ld r31,-8(1)
+ blr
diff --git a/arch/powerpc/lib/copyuser.S b/arch/powerpc/lib/copyuser.S
new file mode 100644
index 00000000000..a0b3fbbd6fb
--- /dev/null
+++ b/arch/powerpc/lib/copyuser.S
@@ -0,0 +1,576 @@
+/*
+ * arch/ppc64/lib/copyuser.S
+ *
+ * Copyright (C) 2002 Paul Mackerras, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+ .align 7
+_GLOBAL(__copy_tofrom_user)
+ /* first check for a whole page copy on a page boundary */
+ cmpldi cr1,r5,16
+ cmpdi cr6,r5,4096
+ or r0,r3,r4
+ neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */
+ andi. r0,r0,4095
+ std r3,-24(r1)
+ crand cr0*4+2,cr0*4+2,cr6*4+2
+ std r4,-16(r1)
+ std r5,-8(r1)
+ dcbt 0,r4
+ beq .Lcopy_page
+ andi. r6,r6,7
+ mtcrf 0x01,r5
+ blt cr1,.Lshort_co