aboutsummaryrefslogtreecommitdiff
path: root/arch/sparc64/lib
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/sparc64/lib
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r-- arch/sparc64/lib/Makefile 20
-rw-r--r-- arch/sparc64/lib/PeeCeeI.c 237
-rw-r--r-- arch/sparc64/lib/U1copy_from_user.S 33
-rw-r--r-- arch/sparc64/lib/U1copy_to_user.S 33
-rw-r--r-- arch/sparc64/lib/U1memcpy.S 560
-rw-r--r-- arch/sparc64/lib/U3copy_from_user.S 22
-rw-r--r-- arch/sparc64/lib/U3copy_to_user.S 33
-rw-r--r-- arch/sparc64/lib/U3memcpy.S 422
-rw-r--r-- arch/sparc64/lib/U3patch.S 32
-rw-r--r-- arch/sparc64/lib/VISsave.S 131
-rw-r--r-- arch/sparc64/lib/atomic.S 139
-rw-r--r-- arch/sparc64/lib/bitops.S 145
-rw-r--r-- arch/sparc64/lib/bzero.S 158
-rw-r--r-- arch/sparc64/lib/checksum.S 172
-rw-r--r-- arch/sparc64/lib/clear_page.S 105
-rw-r--r-- arch/sparc64/lib/copy_in_user.S 119
-rw-r--r-- arch/sparc64/lib/copy_page.S 242
-rw-r--r-- arch/sparc64/lib/csum_copy.S 308
-rw-r--r-- arch/sparc64/lib/csum_copy_from_user.S 21
-rw-r--r-- arch/sparc64/lib/csum_copy_to_user.S 21
-rw-r--r-- arch/sparc64/lib/debuglocks.c 376
-rw-r--r-- arch/sparc64/lib/dec_and_lock.S 78
-rw-r--r-- arch/sparc64/lib/delay.c 49
-rw-r--r-- arch/sparc64/lib/find_bit.c 127
-rw-r--r-- arch/sparc64/lib/iomap.c 48
-rw-r--r-- arch/sparc64/lib/ipcsum.S 34
-rw-r--r-- arch/sparc64/lib/mcount.S 61
-rw-r--r-- arch/sparc64/lib/memcmp.S 28
-rw-r--r-- arch/sparc64/lib/memmove.S 31
-rw-r--r-- arch/sparc64/lib/memscan.S 129
-rw-r--r-- arch/sparc64/lib/rwsem.S 165
-rw-r--r-- arch/sparc64/lib/strlen.S 80
-rw-r--r-- arch/sparc64/lib/strlen_user.S 95
-rw-r--r-- arch/sparc64/lib/strncmp.S 32
-rw-r--r-- arch/sparc64/lib/strncpy_from_user.S 139
-rw-r--r-- arch/sparc64/lib/user_fixup.c 71
-rw-r--r-- arch/sparc64/lib/xor.S 354
37 files changed, 4850 insertions, 0 deletions
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
new file mode 100644
index 00000000000..40dbeec7e5d
--- /dev/null
+++ b/arch/sparc64/lib/Makefile
@@ -0,0 +1,20 @@
+# $Id: Makefile,v 1.25 2000/12/14 22:57:25 davem Exp $
+# Makefile for the sparc64 library files.
+# Most objects are collected in lib-y; iomap.o is added to obj-y instead.
+
+EXTRA_AFLAGS := -ansi
+EXTRA_CFLAGS := -Werror
+
+lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
+ memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
+ bzero.o csum_copy.o csum_copy_from_user.o csum_copy_to_user.o \
+ VISsave.o atomic.o bitops.o \
+ U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
+ U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
+ copy_in_user.o user_fixup.o memmove.o \
+ mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
+
+lib-$(CONFIG_DEBUG_SPINLOCK) += debuglocks.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+
+obj-y += iomap.o
diff --git a/arch/sparc64/lib/PeeCeeI.c b/arch/sparc64/lib/PeeCeeI.c
new file mode 100644
index 00000000000..3008d536e8c
--- /dev/null
+++ b/arch/sparc64/lib/PeeCeeI.c
@@ -0,0 +1,237 @@
+/* $Id: PeeCeeI.c,v 1.4 1999/09/06 01:17:35 davem Exp $
+ * PeeCeeI.c: The emerging standard...
+ *
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/io.h>
+#include <asm/byteorder.h>
+
+void outsb(void __iomem *addr, const void *src, unsigned long count)
+{
+	/* Send 'count' bytes from 'src', in order, to the port at 'addr'. */
+	const u8 *next = src;
+	for (; count; count--)
+		outb(*next++, addr);
+}
+
+void outsw(void __iomem *addr, const void *src, unsigned long count)
+{
+	/* Send 'count' 16-bit words from 'src' to the port at 'addr'.  A lone
+	 * halfword goes first when bit 1 of 'src' is set, the bulk is loaded
+	 * 32 bits at a time and sent low half first, then any odd halfword.
+	 */
+	u16 *half = (u16 *)src;
+	u32 *word;
+
+	if (!count)
+		return;
+
+	if (((u64)src) & 0x2) {
+		outw(le16_to_cpup(half), addr);
+		half++;
+		count--;
+	}
+
+	for (word = (u32 *)half; count >= 2; count -= 2, word++) {
+		u32 pair = le32_to_cpup(word);
+
+		outw(pair >> 0, addr);
+		outw(pair >> 16, addr);
+	}
+
+	if (count)
+		outw(le16_to_cpup((u16 *)word), addr);
+}
+
+void outsl(void __iomem *addr, const void *src, unsigned long count)
+{ /* Write 'count' 32-bit words from 'src' to the port at 'addr'. */
+ if(count) {
+ if((((u64)src) & 0x3) == 0) { /* aligned source: one load per word */
+ u32 *p = (u32 *)src;
+ while(count--) {
+ u32 val = cpu_to_le32p(p);
+ outl(val, addr);
+ p++;
+ }
+ } else {
+ u8 *pb;
+ u16 *ps = (u16 *)src;
+ u32 l = 0, l2; /* l: data loaded earlier, l2: newest load */
+ u32 *pi;
+ /* Misaligned source: every word written is pieced together from l and l2. */
+ switch(((u64)src) & 0x3) {
+ case 0x2: /* loads: halfword, 32-bit words, halfword */
+ count -= 1; /* the last word is written after the loop */
+ l = cpu_to_le16p(ps) << 16;
+ ps++;
+ pi = (u32 *)ps;
+ while(count--) {
+ l2 = cpu_to_le32p(pi);
+ pi++;
+ outl(((l >> 16) | (l2 << 16)), addr);
+ l = l2;
+ }
+ ps = (u16 *)pi;
+ l2 = cpu_to_le16p(ps);
+ outl(((l >> 16) | (l2 << 16)), addr);
+ break;
+
+ case 0x1: /* loads: byte, halfword, 32-bit words, byte */
+ count -= 1; /* the last word is written after the loop */
+ pb = (u8 *)src;
+ l = (*pb++ << 8);
+ ps = (u16 *)pb;
+ l2 = cpu_to_le16p(ps);
+ ps++;
+ l |= (l2 << 16);
+ pi = (u32 *)ps;
+ while(count--) {
+ l2 = cpu_to_le32p(pi);
+ pi++;
+ outl(((l >> 8) | (l2 << 24)), addr);
+ l = l2;
+ }
+ pb = (u8 *)pi;
+ outl(((l >> 8) | (*pb << 24)), addr);
+ break;
+
+ case 0x3: /* loads: byte, 32-bit words, halfword, byte */
+ count -= 1; /* the last word is written after the loop */
+ pb = (u8 *)src;
+ l = (*pb++ << 24);
+ pi = (u32 *)pb;
+ while(count--) {
+ l2 = cpu_to_le32p(pi);
+ pi++;
+ outl(((l >> 24) | (l2 << 8)), addr);
+ l = l2;
+ }
+ ps = (u16 *)pi;
+ l2 = cpu_to_le16p(ps);
+ ps++;
+ pb = (u8 *)ps;
+ l2 |= (*pb << 16);
+ outl(((l >> 24) | (l2 << 8)), addr);
+ break;
+ }
+ }
+ }
+}
+
+void insb(void __iomem *addr, void *dst, unsigned long count)
+{
+	/* Read 'count' bytes from the port at 'addr' into 'dst': single
+	 * bytes until 'dst' is 4-byte aligned, then four bytes packed per
+	 * u32 store (first byte read is most significant), then the tail.
+	 */
+	u8 *pb = dst;
+	u32 *pi;
+
+	/* Check 'count' before decrementing so it cannot wrap below zero
+	 * when the bytes run out before 'pb' becomes aligned.
+	 */
+	for (; count && (((unsigned long)pb) & 0x3); count--)
+		*pb++ = inb(addr);
+	for (pi = (u32 *)pb; count >= 4; count -= 4) {
+		u32 w = (inb(addr) << 24);
+		w |= (inb(addr) << 16);
+		w |= (inb(addr) << 8);
+		w |= (inb(addr) << 0);
+		*pi++ = w;
+	}
+	for (pb = (u8 *)pi; count; count--)
+		*pb++ = inb(addr);
+}
+
+void insw(void __iomem *addr, void *dst, unsigned long count)
+{
+	/* Read 'count' 16-bit words from 'addr' into 'dst': one halfword
+	 * store when bit 1 of 'dst' is set, then pairs packed into u32
+	 * stores (first word read in the upper half), then any odd one.
+	 */
+	u16 *half = dst;
+	u32 *word;
+
+	if (!count)
+		return;
+	if (((unsigned long)half) & 0x2) {
+		*half++ = le16_to_cpu(inw(addr));
+		count--;
+	}
+	for (word = (u32 *)half; count >= 2; count -= 2) {
+		u32 hi = le16_to_cpu(inw(addr)) << 16;
+		u32 lo = le16_to_cpu(inw(addr)) << 0;
+
+		*word++ = hi | lo;
+	}
+	if (count)
+		*(u16 *)word = le16_to_cpu(inw(addr));
+}
+
+void insl(void __iomem *addr, void *dst, unsigned long count)
+{
+	/* Read 'count' 32-bit words from 'addr' into 'dst'.  A misaligned 'dst'
+	 * gets each word split across stores; 'l' carries what is left over.
+	 */
+	u32 l = 0, l2, *pi;
+	u16 *ps;
+	u8 *pb;
+
+	if (!count)
+		return;
+	switch (((unsigned long)dst) & 3) {
+	case 0x0:
+		pi = dst;
+		while (count--)
+			*pi++ = le32_to_cpu(inl(addr));
+		break;
+	case 0x2:
+		ps = dst;
+		count -= 1;
+		l = le32_to_cpu(inl(addr));
+		*ps++ = l >> 16;	/* upper half now, lower half below */
+		pi = (u32 *)ps;
+		while (count--) {
+			l2 = le32_to_cpu(inl(addr));
+			*pi++ = (l << 16) | (l2 >> 16);
+			l = l2;
+		}
+		ps = (u16 *)pi;
+		*ps = l;
+		break;
+	case 0x1:
+		pb = dst;
+		count -= 1;
+		l = le32_to_cpu(inl(addr));
+		*pb++ = l >> 24;
+		ps = (u16 *)pb;
+		*ps++ = ((l >> 8) & 0xffff);
+		pi = (u32 *)ps;
+		while (count--) {
+			l2 = le32_to_cpu(inl(addr));
+			*pi++ = (l << 24) | (l2 >> 8);
+			l = l2;
+		}
+		pb = (u8 *)pi;
+		*pb = l;
+		break;
+	case 0x3:
+		pb = (u8 *)dst;
+		count -= 1;
+		l = le32_to_cpu(inl(addr));
+		*pb++ = l >> 24;
+		pi = (u32 *)pb;
+		while (count--) {
+			l2 = le32_to_cpu(inl(addr));
+			*pi++ = (l << 8) | (l2 >> 24);
+			l = l2;
+		}
+		ps = (u16 *)pi;
+		*ps++ = ((l >> 8) & 0xffff);
+		pb = (u8 *)ps;
+		*pb = l;
+		break;
+	}
+}
+
diff --git a/arch/sparc64/lib/U1copy_from_user.S b/arch/sparc64/lib/U1copy_from_user.S
new file mode 100644
index 00000000000..93146a81e2d
--- /dev/null
+++ b/arch/sparc64/lib/U1copy_from_user.S
@@ -0,0 +1,33 @@
+/* U1copy_from_user.S: UltraSparc-I/II/IIi/IIe optimized copy from userspace.
+ *
+ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
+ */
+/* EX_LD(x): load 'x' at label 98, with __ex_table pairing it to the fixup at 99 (returns %o0 = 1). */
+#define EX_LD(x) \
+98: x; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ mov 1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+/* Overrides for the hooks that U1memcpy.S (included below) otherwise defaults. */
+#define FUNC_NAME ___copy_from_user
+#define LOAD(type,addr,dest) type##a [addr] %asi, dest
+#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_AIUS, dest
+#define EX_RETVAL(x) 0
+
+ /* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+#define PREAMBLE \
+ rd %asi, %g1; \
+ cmp %g1, ASI_AIUS; \
+ bne,pn %icc, memcpy_user_stub; \
+ nop; \
+
+#include "U1memcpy.S"
diff --git a/arch/sparc64/lib/U1copy_to_user.S b/arch/sparc64/lib/U1copy_to_user.S
new file mode 100644
index 00000000000..1fccc521e2b
--- /dev/null
+++ b/arch/sparc64/lib/U1copy_to_user.S
@@ -0,0 +1,33 @@
+/* U1copy_to_user.S: UltraSparc-I/II/IIi/IIe optimized copy to userspace.
+ *
+ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
+ */
+/* EX_ST(x): store 'x' at label 98, with __ex_table pairing it to the fixup at 99 (returns %o0 = 1). */
+#define EX_ST(x) \
+98: x; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ mov 1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+/* Overrides for the hooks that U1memcpy.S (included below) otherwise defaults. */
+#define FUNC_NAME ___copy_to_user
+#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
+#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS
+#define EX_RETVAL(x) 0
+
+ /* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+#define PREAMBLE \
+ rd %asi, %g1; \
+ cmp %g1, ASI_AIUS; \
+ bne,pn %icc, memcpy_user_stub; \
+ nop; \
+
+#include "U1memcpy.S"
diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
new file mode 100644
index 00000000000..da9b520c718
--- /dev/null
+++ b/arch/sparc64/lib/U1memcpy.S
@@ -0,0 +1,560 @@
+/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
+ *
+ * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#define GLOBAL_SPARE g7
+#else
+#define GLOBAL_SPARE g5
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF 0x04
+#ifdef MEMCPY_DEBUG
+#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
+ clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
+#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#else
+#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#endif
+#endif
+/* Defaults for the hooks a wrapper file may define before including this one. */
+#ifndef EX_LD
+#define EX_LD(x) x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x) x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x) x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest) type [addr], dest
+#endif
+
+#ifndef LOAD_BLK
+#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr) type src, [addr]
+#endif
+
+#ifndef STORE_BLK
+#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME memcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+/* FREG_FROB: eight faligndata ops over the pairs (f1,f2)..(f8,f9), results in %f48..%f62. */
+#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \
+ faligndata %f1, %f2, %f48; \
+ faligndata %f2, %f3, %f50; \
+ faligndata %f3, %f4, %f52; \
+ faligndata %f4, %f5, %f54; \
+ faligndata %f5, %f6, %f56; \
+ faligndata %f6, %f7, %f58; \
+ faligndata %f7, %f8, %f60; \
+ faligndata %f8, %f9, %f62;
+/* MAIN_LOOP_CHUNK: block load/store, advance src and dest by 0x40, branch to jmptgt when len - 0x40 is zero. */
+#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \
+ EX_LD(LOAD_BLK(%src, %fdest)); \
+ EX_ST(STORE_BLK(%fsrc, %dest)); \
+ add %src, 0x40, %src; \
+ subcc %len, 0x40, %len; \
+ be,pn %xcc, jmptgt; \
+ add %dest, 0x40, %dest; \
+
+#define LOOP_CHUNK1(src, dest, len, branch_dest) \
+ MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest)
+#define LOOP_CHUNK2(src, dest, len, branch_dest) \
+ MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
+#define LOOP_CHUNK3(src, dest, len, branch_dest) \
+ MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
+/* STORE_SYNC: block-store fsrc to dest and advance dest by 0x40. */
+#define STORE_SYNC(dest, fsrc) \
+ EX_ST(STORE_BLK(%fsrc, %dest)); \
+ add %dest, 0x40, %dest;
+/* STORE_JUMP: as STORE_SYNC, then branch to target. */
+#define STORE_JUMP(dest, fsrc, target) \
+ EX_ST(STORE_BLK(%fsrc, %dest)); \
+ add %dest, 0x40, %dest; \
+ ba,pt %xcc, target;
+/* FINISH_VISCHUNK: left -= 8 and branch to 95f if negative; else store faligndata(f0,f1) and advance dest by 8. */
+#define FINISH_VISCHUNK(dest, f0, f1, left) \
+ subcc %left, 8, %left;\
+ bl,pn %xcc, 95f; \
+ faligndata %f0, %f1, %f48; \
+ EX_ST(STORE(std, %f48, %dest)); \
+ add %dest, 8, %dest;
+/* UNEVEN_VISCHUNK_LAST: left -= 8 and branch to 95f if negative; fsrc1 copies f0 into f1. */
+#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \
+ subcc %left, 8, %left; \
+ bl,pn %xcc, 95f; \
+ fsrc1 %f0, %f1;
+/* UNEVEN_VISCHUNK: UNEVEN_VISCHUNK_LAST followed by an annulled branch to 93f. */
+#define UNEVEN_VISCHUNK(dest, f0, f1, left) \
+ UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \
+ ba,a,pt %xcc, 93f;
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+ .text
+ .align 64
+
+ .globl FUNC_NAME
+ .type FUNC_NAME,#function
+FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
+ srlx %o2, 31, %g2
+ cmp %g2, 0
+ tne %xcc, 5
+ PREAMBLE
+ mov %o0, %o4
+ cmp %o2, 0
+ be,pn %XCC, 85f
+ or %o0, %o1, %o3
+ cmp %o2, 16
+ blu,a,pn %XCC, 80f
+ or %o3, %o2, %o3
+
+ cmp %o2, (5 * 64)
+ blu,pt %XCC, 70f
+ andcc %o3, 0x7, %g0
+
+ /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */
+ VISEntry
+
+ /* Is 'dst' already aligned on an 64-byte boundary? */
+ andcc %o0, 0x3f, %g2
+ be,pt %XCC, 2f
+
+ /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
+ * of bytes to copy to make 'dst' 64-byte aligned. We pre-
+ * subtract this from 'len'.
+ */
+ sub %o0, %o1, %GLOBAL_SPARE
+ sub %g2, 0x40, %g2
+ sub %g0, %g2, %g2
+ sub %o2, %g2, %o2
+ andcc %g2, 0x7, %g1
+ be,pt %icc, 2f
+ and %g2, 0x38, %g2
+
+1: subcc %g1, 0x1, %g1
+ EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
+ EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
+ bgu,pt %XCC, 1b
+ add %o1, 0x1, %o1
+
+ add %o1, %GLOBAL_SPARE, %o0
+
+2: cmp %g2, 0x0
+ and %o1, 0x7, %g1
+ be,pt %icc, 3f
+ alignaddr %o1, %g0, %o1
+
+ EX_LD(LOAD(ldd, %o1, %f4))
+1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
+ add %o1, 0x8, %o1
+ subcc %g2, 0x8, %g2
+ faligndata %f4, %f6, %f0
+ EX_ST(STORE(std, %f0, %o0))
+ be,pn %icc, 3f
+ add %o0, 0x8, %o0
+
+ EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
+ add %o1, 0x8, %o1
+ subcc %g2, 0x8, %g2
+ faligndata %f6, %f4, %f0
+ EX_ST(STORE(std, %f0, %o0))
+ bne,pt %icc, 1b
+ add %o0, 0x8, %o0
+
+ /* Destination is 64-byte aligned. */
+3:
+ membar #LoadStore | #StoreStore | #StoreLoad
+
+ subcc %o2, 0x40, %GLOBAL_SPARE
+ add %o1, %g1, %g1
+ andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
+ srl %g1, 3, %g2
+ sub %o2, %GLOBAL_SPARE, %g3
+ andn %o1, (0x40 - 1), %o1
+ and %g2, 7, %g2
+ andncc %g3, 0x7, %g3
+ fmovd %f0, %f2
+ sub %g3, 0x8, %g3
+ sub %o2, %GLOBAL_SPARE, %o2
+
+ add %g1, %GLOBAL_SPARE, %g1
+ subcc %o2, %g3, %o2
+
+ EX_LD(LOAD_BLK(%o1, %f0))
+ add %o1, 0x40, %o1
+ add %g1, %g3, %g1
+ EX_LD(LOAD_BLK(%o1, %f16))
+ add %o1, 0x40, %o1
+ sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
+ EX_LD(LOAD_BLK(%o1, %f32))
+ add %o1, 0x40, %o1
+
+ /* There are 8 instances of the unrolled loop,
+ * one for each possible alignment of the
+ * source buffer. Each loop instance is 452
+ * bytes.
+ */
+ sll %g2, 3, %o3
+ sub %o3, %g2, %o3
+ sllx %o3, 4, %o3
+ add %o3, %g2, %o3
+ sllx %o3, 2, %g2
+1: rd %pc, %o3
+ add %o3, %lo(1f - 1b), %o3
+ jmpl %o3 + %g2, %g0
+ nop
+
+ .align 64
+1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f0, %f2, %f48
+1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
+ STORE_JUMP(o0, f48, 40f) membar #Sync
+2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
+ STORE_JUMP(o0, f48, 48f) membar #Sync
+3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
+ STORE_JUMP(o0, f48, 56f) membar #Sync
+
+1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f2, %f4, %f48
+1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
+ STORE_JUMP(o0, f48, 41f) membar #Sync
+2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
+ STORE_JUMP(o0, f48, 49f) membar #Sync
+3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
+ STORE_JUMP(o0, f48, 57f) membar #Sync
+
+1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f4, %f6, %f48
+1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
+ STORE_JUMP(o0, f48, 42f) membar #Sync
+2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
+ STORE_JUMP(o0, f48, 50f) membar #Sync
+3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
+ STORE_JUMP(o0, f48, 58f) membar #Sync
+
+1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f6, %f8, %f48
+1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
+ STORE_JUMP(o0, f48, 43f) membar #Sync
+2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
+ STORE_JUMP(o0, f48, 51f) membar #Sync
+3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
+ STORE_JUMP(o0, f48, 59f) membar #Sync
+
+1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f8, %f10, %f48
+1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
+ STORE_JUMP(o0, f48, 44f) membar #Sync
+2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
+ STORE_JUMP(o0, f48, 52f) membar #Sync
+3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
+ STORE_JUMP(o0, f48, 60f) membar #Sync
+
+1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f10, %f12, %f48
+1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
+ STORE_JUMP(o0, f48, 45f) membar #Sync
+2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
+ STORE_JUMP(o0, f48, 53f) membar #Sync
+3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
+ STORE_JUMP(o0, f48, 61f) membar #Sync
+
+1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f12, %f14, %f48
+1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
+ STORE_JUMP(o0, f48, 46f) membar #Sync
+2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
+ STORE_JUMP(o0, f48, 54f) membar #Sync
+3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
+ STORE_JUMP(o0, f48, 62f) membar #Sync
+
+1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+ FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+ FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+ ba,pt %xcc, 1b+4
+ faligndata %f14, %f16, %f48
+1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
+ STORE_JUMP(o0, f48, 47f) membar #Sync
+2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
+ STORE_JUMP(o0, f48, 55f) membar #Sync
+3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
+ STORE_SYNC(o0, f48) membar #Sync
+ FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
+ STORE_JUMP(o0, f48, 63f) membar #Sync
+
+40: FINISH_VISCHUNK(o0, f0, f2, g3)
+41: FINISH_VISCHUNK(o0, f2, f4, g3)
+42: FINISH_VISCHUNK(o0, f4, f6, g3)
+43: FINISH_VISCHUNK(o0, f6, f8, g3)
+44: FINISH_VISCHUNK(o0, f8, f10, g3)
+45: FINISH_VISCHUNK(o0, f10, f12, g3)
+46: FINISH_VISCHUNK(o0, f12, f14, g3)
+47: UNEVEN_VISCHUNK(o0, f14, f0, g3)
+48: FINISH_VISCHUNK(o0, f16, f18, g3)
+49: FINISH_VISCHUNK(o0, f18, f20, g3)
+50: FINISH_VISCHUNK(o0, f20, f22, g3)
+51: FINISH_VISCHUNK(o0, f22, f24, g3)
+52: FINISH_VISCHUNK(o0, f24, f26, g3)
+53: FINISH_VISCHUNK(o0, f26, f28, g3)
+54: FINISH_VISCHUNK(o0, f28, f30, g3)
+55: UNEVEN_VISCHUNK(o0, f30, f0, g3)
+56: FINISH_VISCHUNK(o0, f32, f34, g3)
+57: FINISH_VISCHUNK(o0, f34, f36, g3)
+58: FINISH_VISCHUNK(o0, f36, f38, g3)
+59: FINISH_VISCHUNK(o0, f38, f40, g3)
+60: FINISH_VISCHUNK(o0, f40, f42, g3)
+61: FINISH_VISCHUNK(o0, f42, f44, g3)
+62: FINISH_VISCHUNK(o0, f44, f46, g3)
+63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3)
+
+93: EX_LD(LOAD(ldd, %o1, %f2))
+ add %o1, 8, %o1
+ subcc %g3, 8, %g3
+ faligndata %f0, %f2, %f8
+ EX_ST(STORE(std, %f8, %o0))
+ bl,pn %xcc, 95f
+ add %o0, 8, %o0
+ EX_LD(LOAD(ldd, %o1, %f0))
+ add %o1, 8, %o1
+ subcc %g3, 8, %g3
+ faligndata %f2, %f0, %f8
+ EX_ST(STORE(std, %f8, %o0))
+ bge,pt %xcc, 93b
+ add %o0, 8, %o0
+
+95: brz,pt %o2, 2f
+ mov %g1, %o1
+
+1: EX_LD(LOAD(ldub, %o1, %o3))
+ add %o1, 1, %o1
+ subcc %o2, 1, %o2
+ EX_ST(STORE(stb, %o3, %o0))
+ bne,pt %xcc, 1b
+ add %o0, 1, %o0
+
+2: membar #StoreLoad | #StoreStore
+ VISExit
+ retl
+ mov EX_RETVAL(%o4), %o0
+
+ .align 64
+70: /* 16 < len <= (5 * 64) */
+ bne,pn %XCC, 75f
+ sub %o0, %o1, %o3
+
+72: andn %o2, 0xf, %GLOBAL_SPARE
+ and %o2, 0xf, %o2
+1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
+ EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
+ subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
+ EX_ST(STORE(stx, %o5, %o1 + %o3))
+ add %o1, 0x8, %o1
+ EX_ST(STORE(stx, %g1, %o1 + %o3))
+ bgu,pt %XCC, 1b
+ add %o1, 0x8, %o1
+73: andcc %o2, 0x8, %g0
+ be,pt %XCC, 1f
+ nop
+ EX_LD(LOAD(ldx, %o1, %o5))
+ sub %o2, 0x8, %o2
+ EX_ST(STORE(stx, %o5, %o1 + %o3))
+ add %o1, 0x8, %o1
+1: andcc %o2, 0x4, %g0
+ be,pt %XCC, 1f
+ nop
+ EX_LD(LOAD(lduw, %o1, %o5))
+ sub %o2, 0x4, %o2
+ EX_ST(STORE(stw, %o5, %o1 + %o3))
+ add %o1, 0x4, %o1
+1: cmp %o2, 0
+ be,pt %XCC, 85f
+ nop
+ ba,pt %xcc, 90f
+ nop
+
+75: andcc %o0, 0x7, %g1
+ sub %g1, 0x8, %g1
+ be,pn %icc, 2f
+ sub %g0, %g1, %g1
+ sub %o2, %g1, %o2
+
+1: EX_LD(LOAD(ldub, %o1, %o5))
+ subcc %g1, 1, %g1
+ EX_ST(STORE(stb, %o5, %o1 + %o3))
+ bgu,pt %icc, 1b
+ add %o1, 1, %o1
+
+2: add %o1, %o3, %o0
+ andcc %o1, 0x7, %g1
+ bne,pt %icc, 8f
+ sll %g1, 3, %g1
+
+ cmp %o2, 16
+ bgeu,pt %icc, 72b
+ nop
+ ba,a,pt %xcc, 73b
+
+8: mov 64, %o3
+ andn %o1, 0x7, %o1
+ EX_LD(LOAD(ldx, %o1, %g2))
+ sub %o3, %g1, %o3
+ andn %o2, 0x7, %GLOBAL_SPARE
+ sllx %g2, %g1, %g2
+1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
+ subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
+ add %o1, 0x8, %o1
+ srlx %g3, %o3, %o5
+ or %o5, %g2, %o5
+ EX_ST(STORE(stx, %o5, %o0))
+ add %o0, 0x8, %o0
+ bgu,pt %icc, 1b
+ sllx %g3, %g1, %g2
+
+ srl %g1, 3, %g1
+ andcc %o2, 0x7, %o2
+ be,pn %icc, 85f
+ add %o1, %g1, %o1
+ ba,pt %xcc, 90f
+ sub %o0, %o1, %o3
+
+ .align 64
+80: /* 0 < len <= 16 */
+ andcc %o3, 0x3, %g0