author    | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700
commit    | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      | 0bba044c4ce775e45a88a51686b5d9f90697ea9d | /arch/sparc64/lib
tag       | v2.6.12-rc2 (Linux-2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/sparc64/lib')
37 files changed, 4850 insertions, 0 deletions
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
new file mode 100644
index 00000000000..40dbeec7e5d
--- /dev/null
+++ b/arch/sparc64/lib/Makefile
@@ -0,0 +1,20 @@
+# $Id: Makefile,v 1.25 2000/12/14 22:57:25 davem Exp $
+# Makefile for Sparc64 library files..
+#
+
+EXTRA_AFLAGS := -ansi
+EXTRA_CFLAGS := -Werror
+
+lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
+	 memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
+	 bzero.o csum_copy.o csum_copy_from_user.o csum_copy_to_user.o \
+	 VISsave.o atomic.o bitops.o \
+	 U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
+	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
+	 copy_in_user.o user_fixup.o memmove.o \
+	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
+
+lib-$(CONFIG_DEBUG_SPINLOCK) += debuglocks.o
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
+
+obj-y += iomap.o
diff --git a/arch/sparc64/lib/PeeCeeI.c b/arch/sparc64/lib/PeeCeeI.c
new file mode 100644
index 00000000000..3008d536e8c
--- /dev/null
+++ b/arch/sparc64/lib/PeeCeeI.c
@@ -0,0 +1,237 @@
+/* $Id: PeeCeeI.c,v 1.4 1999/09/06 01:17:35 davem Exp $
+ * PeeCeeI.c: The emerging standard...
+ *
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ */
+
+#include <asm/io.h>
+#include <asm/byteorder.h>
+
+void outsb(void __iomem *addr, const void *src, unsigned long count)
+{
+	const u8 *p = src;
+
+	while(count--)
+		outb(*p++, addr);
+}
+
+void outsw(void __iomem *addr, const void *src, unsigned long count)
+{
+	if(count) {
+		u16 *ps = (u16 *)src;
+		u32 *pi;
+
+		if(((u64)src) & 0x2) {
+			u16 val = le16_to_cpup(ps);
+			outw(val, addr);
+			ps++;
+			count--;
+		}
+		pi = (u32 *)ps;
+		while(count >= 2) {
+			u32 w = le32_to_cpup(pi);
+
+			pi++;
+			outw(w >> 0, addr);
+			outw(w >> 16, addr);
+			count -= 2;
+		}
+		ps = (u16 *)pi;
+		if(count) {
+			u16 val = le16_to_cpup(ps);
+			outw(val, addr);
+		}
+	}
+}
+
+void outsl(void __iomem *addr, const void *src, unsigned long count)
+{
+	if(count) {
+		if((((u64)src) & 0x3) == 0) {
+			u32 *p = (u32 *)src;
+			while(count--) {
+				u32 val = cpu_to_le32p(p);
+				outl(val, addr);
+				p++;
+			}
+		} else {
+			u8 *pb;
+			u16 *ps = (u16 *)src;
+			u32 l = 0, l2;
+			u32 *pi;
+
+			switch(((u64)src) & 0x3) {
+			case 0x2:
+				count -= 1;
+				l = cpu_to_le16p(ps) << 16;
+				ps++;
+				pi = (u32 *)ps;
+				while(count--) {
+					l2 = cpu_to_le32p(pi);
+					pi++;
+					outl(((l >> 16) | (l2 << 16)), addr);
+					l = l2;
+				}
+				ps = (u16 *)pi;
+				l2 = cpu_to_le16p(ps);
+				outl(((l >> 16) | (l2 << 16)), addr);
+				break;
+
+			case 0x1:
+				count -= 1;
+				pb = (u8 *)src;
+				l = (*pb++ << 8);
+				ps = (u16 *)pb;
+				l2 = cpu_to_le16p(ps);
+				ps++;
+				l |= (l2 << 16);
+				pi = (u32 *)ps;
+				while(count--) {
+					l2 = cpu_to_le32p(pi);
+					pi++;
+					outl(((l >> 8) | (l2 << 24)), addr);
+					l = l2;
+				}
+				pb = (u8 *)pi;
+				outl(((l >> 8) | (*pb << 24)), addr);
+				break;
+
+			case 0x3:
+				count -= 1;
+				pb = (u8 *)src;
+				l = (*pb++ << 24);
+				pi = (u32 *)pb;
+				while(count--) {
+					l2 = cpu_to_le32p(pi);
+					pi++;
+					outl(((l >> 24) | (l2 << 8)), addr);
+					l = l2;
+				}
+				ps = (u16 *)pi;
+				l2 = cpu_to_le16p(ps);
+				ps++;
+				pb = (u8 *)ps;
+				l2 |= (*pb << 16);
+				outl(((l >> 24) | (l2 << 8)), addr);
+				break;
+			}
+		}
+	}
+}
+
+void insb(void __iomem *addr, void *dst, unsigned long count)
+{
+	if(count) {
+		u32 *pi;
+		u8 *pb = dst;
+
+		while((((unsigned long)pb) & 0x3) && count--)
+			*pb++ = inb(addr);
+		pi = (u32 *)pb;
+		while(count >= 4) {
+			u32 w;
+
+			w = (inb(addr) << 24);
+			w |= (inb(addr) << 16);
+			w |= (inb(addr) << 8);
+			w |= (inb(addr) << 0);
+			*pi++ = w;
+			count -= 4;
+		}
+		pb = (u8 *)pi;
+		while(count--)
+			*pb++ = inb(addr);
+	}
+}
+
+void insw(void __iomem *addr, void *dst, unsigned long count)
+{
+	if(count) {
+		u16 *ps = dst;
+		u32 *pi;
+
+		if(((unsigned long)ps) & 0x2) {
+			*ps++ = le16_to_cpu(inw(addr));
+			count--;
+		}
+		pi = (u32 *)ps;
+		while(count >= 2) {
+			u32 w;
+
+			w = (le16_to_cpu(inw(addr)) << 16);
+			w |= (le16_to_cpu(inw(addr)) << 0);
+			*pi++ = w;
+			count -= 2;
+		}
+		ps = (u16 *)pi;
+		if(count)
+			*ps = le16_to_cpu(inw(addr));
+	}
+}
+
+void insl(void __iomem *addr, void *dst, unsigned long count)
+{
+	if(count) {
+		if((((unsigned long)dst) & 0x3) == 0) {
+			u32 *pi = dst;
+			while(count--)
+				*pi++ = le32_to_cpu(inl(addr));
+		} else {
+			u32 l = 0, l2, *pi;
+			u16 *ps;
+			u8 *pb;
+
+			switch(((unsigned long)dst) & 3) {
+			case 0x2:
+				ps = dst;
+				count -= 1;
+				l = le32_to_cpu(inl(addr));
+				*ps++ = l;
+				pi = (u32 *)ps;
+				while(count--) {
+					l2 = le32_to_cpu(inl(addr));
+					*pi++ = (l << 16) | (l2 >> 16);
+					l = l2;
+				}
+				ps = (u16 *)pi;
+				*ps = l;
+				break;
+
+			case 0x1:
+				pb = dst;
+				count -= 1;
+				l = le32_to_cpu(inl(addr));
+				*pb++ = l >> 24;
+				ps = (u16 *)pb;
+				*ps++ = ((l >> 8) & 0xffff);
+				pi = (u32 *)ps;
+				while(count--) {
+					l2 = le32_to_cpu(inl(addr));
+					*pi++ = (l << 24) | (l2 >> 8);
+					l = l2;
+				}
+				pb = (u8 *)pi;
+				*pb = l;
+				break;
+
+			case 0x3:
+				pb = (u8 *)dst;
+				count -= 1;
+				l = le32_to_cpu(inl(addr));
+				*pb++ = l >> 24;
+				pi = (u32 *)pb;
+				while(count--) {
+					l2 = le32_to_cpu(inl(addr));
+					*pi++ = (l << 8) | (l2 >> 24);
+					l = l2;
+				}
+				ps = (u16 *)pi;
+				*ps++ = ((l >> 8) & 0xffff);
+				pb = (u8 *)ps;
+				*pb = l;
+				break;
+			}
+		}
+	}
+}
+
diff --git a/arch/sparc64/lib/U1copy_from_user.S b/arch/sparc64/lib/U1copy_from_user.S
new file mode 100644
index 00000000000..93146a81e2d
--- /dev/null
+++ b/arch/sparc64/lib/U1copy_from_user.S
@@ -0,0 +1,33 @@
+/* U1copy_from_user.S: UltraSparc-I/II/IIi/IIe optimized copy from userspace.
+ *
+ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
+ */
+
+#define EX_LD(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	retl;			\
+	 mov 1, %o0;		\
+	.section __ex_table;	\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+#define FUNC_NAME		___copy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
+#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_AIUS, dest
+#define EX_RETVAL(x)		0
+
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop;						\
+
+#include "U1memcpy.S"
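The PeeCeeI.c routines above all follow the same shape: copy single elements until the memory pointer reaches a natural alignment boundary, move the aligned middle of the buffer in whole 32-bit words, then finish the leftover elements one at a time. The following is a minimal C sketch of that head/bulk/tail structure, not kernel code; `read_reg8()` and `insb_sketch()` are hypothetical names used only for illustration.

```c
#include <stdint.h>
#include <stddef.h>

/* Hypothetical 8-bit device-register read, standing in for inb(). */
extern uint8_t read_reg8(void);

/*
 * insb()-style pattern: byte-copy until the destination is 4-byte
 * aligned, pack four byte reads into each aligned 32-bit store, then
 * byte-copy the remaining 0-3 bytes.
 */
void insb_sketch(void *dst, size_t count)
{
	uint8_t *pb = dst;
	uint32_t *pi;

	/* Head: reach a 4-byte boundary. */
	while (((uintptr_t)pb & 0x3) && count) {
		*pb++ = read_reg8();
		count--;
	}

	/* Bulk: one aligned 32-bit store per four register reads
	 * (first byte lands in the high bits, big-endian order). */
	pi = (uint32_t *)pb;
	while (count >= 4) {
		uint32_t w;

		w  = (uint32_t)read_reg8() << 24;
		w |= (uint32_t)read_reg8() << 16;
		w |= (uint32_t)read_reg8() << 8;
		w |= (uint32_t)read_reg8() << 0;
		*pi++ = w;
		count -= 4;
	}

	/* Tail: finish whatever is left. */
	pb = (uint8_t *)pi;
	while (count--)
		*pb++ = read_reg8();
}
```

The point of the alignment dance is that every store to ordinary memory stays naturally aligned, which matters on sparc64, while the device register itself is always accessed one element at a time.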
diff --git a/arch/sparc64/lib/U1copy_to_user.S b/arch/sparc64/lib/U1copy_to_user.S
new file mode 100644
index 00000000000..1fccc521e2b
--- /dev/null
+++ b/arch/sparc64/lib/U1copy_to_user.S
@@ -0,0 +1,33 @@
+/* U1copy_to_user.S: UltraSparc-I/II/IIi/IIe optimized copy to userspace.
+ *
+ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
+ */
+
+#define EX_ST(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	retl;			\
+	 mov 1, %o0;		\
+	.section __ex_table;	\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+#define FUNC_NAME		___copy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] ASI_AIUS
+#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_AIUS
+#define EX_RETVAL(x)		0
+
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop;						\
+
+#include "U1memcpy.S"
diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
new file mode 100644
index 00000000000..da9b520c718
--- /dev/null
+++ b/arch/sparc64/lib/U1memcpy.S
@@ -0,0 +1,560 @@
+/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
+ *
+ * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#define GLOBAL_SPARE	g7
+#else
+#define GLOBAL_SPARE	g5
+#define ASI_BLK_P 0xf0
+#define FPRS_FEF  0x04
+#ifdef MEMCPY_DEBUG
+#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
+		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
+#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#else
+#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#endif
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x)	x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x)	x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest)	type [addr], dest
+#endif
+
+#ifndef LOAD_BLK
+#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr)	type src, [addr]
+#endif
+
+#ifndef STORE_BLK
+#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME	memcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
+	faligndata		%f1, %f2, %f48;		\
+	faligndata		%f2, %f3, %f50;		\
+	faligndata		%f3, %f4, %f52;		\
+	faligndata		%f4, %f5, %f54;		\
+	faligndata		%f5, %f6, %f56;		\
+	faligndata		%f6, %f7, %f58;		\
+	faligndata		%f7, %f8, %f60;		\
+	faligndata		%f8, %f9, %f62;
+
+#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
+	EX_LD(LOAD_BLK(%src, %fdest));				\
+	EX_ST(STORE_BLK(%fsrc, %dest));				\
+	add			%src, 0x40, %src;		\
+	subcc			%len, 0x40, %len;		\
+	be,pn			%xcc, jmptgt;			\
+	 add			%dest, 0x40, %dest;		\
+
+#define LOOP_CHUNK1(src, dest, len, branch_dest)		\
+	MAIN_LOOP_CHUNK(src, dest, f0,  f48, len, branch_dest)
+#define LOOP_CHUNK2(src, dest, len, branch_dest)		\
+	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
+#define LOOP_CHUNK3(src, dest, len, branch_dest)		\
+	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
+
+#define STORE_SYNC(dest, fsrc)				\
+	EX_ST(STORE_BLK(%fsrc, %dest));			\
+	add			%dest, 0x40, %dest;
+
+#define STORE_JUMP(dest, fsrc, target)			\
+	EX_ST(STORE_BLK(%fsrc, %dest));			\
+	add			%dest, 0x40, %dest;	\
+	ba,pt			%xcc, target;
+
+#define FINISH_VISCHUNK(dest, f0, f1, left)	\
+	subcc			%left, 8, %left;\
+	bl,pn			%xcc, 95f;	\
+	 faligndata		%f0, %f1, %f48;	\
+	EX_ST(STORE(std, %f48, %dest));		\
+	add			%dest, 8, %dest;
+
+#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
+	subcc			%left, 8, %left;	\
+	bl,pn			%xcc, 95f;		\
+	 fsrc1			%f0, %f1;
+
+#define UNEVEN_VISCHUNK(dest, f0, f1, left)		\
+	UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
+	ba,a,pt			%xcc, 93f;
+
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+
+	.text
+	.align		64
+
+	.globl		FUNC_NAME
+	.type		FUNC_NAME,#function
+FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
+	srlx		%o2, 31, %g2
+	cmp		%g2, 0
+	tne		%xcc, 5
+	PREAMBLE
+	mov		%o0, %o4
+	cmp		%o2, 0
+	be,pn		%XCC, 85f
+	 or		%o0, %o1, %o3
+	cmp		%o2, 16
+	blu,a,pn	%XCC, 80f
+	 or		%o3, %o2, %o3
+
+	cmp		%o2, (5 * 64)
+	blu,pt		%XCC, 70f
+	 andcc		%o3,
0x7, %g0 + + /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */ + VISEntry + + /* Is 'dst' already aligned on an 64-byte boundary? */ + andcc %o0, 0x3f, %g2 + be,pt %XCC, 2f + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %o0, %o1, %GLOBAL_SPARE + sub %g2, 0x40, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + andcc %g2, 0x7, %g1 + be,pt %icc, 2f + and %g2, 0x38, %g2 + +1: subcc %g1, 0x1, %g1 + EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) + EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) + bgu,pt %XCC, 1b + add %o1, 0x1, %o1 + + add %o1, %GLOBAL_SPARE, %o0 + +2: cmp %g2, 0x0 + and %o1, 0x7, %g1 + be,pt %icc, 3f + alignaddr %o1, %g0, %o1 + + EX_LD(LOAD(ldd, %o1, %f4)) +1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f4, %f6, %f0 + EX_ST(STORE(std, %f0, %o0)) + be,pn %icc, 3f + add %o0, 0x8, %o0 + + EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f6, %f4, %f0 + EX_ST(STORE(std, %f0, %o0)) + bne,pt %icc, 1b + add %o0, 0x8, %o0 + + /* Destination is 64-byte aligned. */ +3: + membar #LoadStore | #StoreStore | #StoreLoad + + subcc %o2, 0x40, %GLOBAL_SPARE + add %o1, %g1, %g1 + andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE + srl %g1, 3, %g2 + sub %o2, %GLOBAL_SPARE, %g3 + andn %o1, (0x40 - 1), %o1 + and %g2, 7, %g2 + andncc %g3, 0x7, %g3 + fmovd %f0, %f2 + sub %g3, 0x8, %g3 + sub %o2, %GLOBAL_SPARE, %o2 + + add %g1, %GLOBAL_SPARE, %g1 + subcc %o2, %g3, %o2 + + EX_LD(LOAD_BLK(%o1, %f0)) + add %o1, 0x40, %o1 + add %g1, %g3, %g1 + EX_LD(LOAD_BLK(%o1, %f16)) + add %o1, 0x40, %o1 + sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE + EX_LD(LOAD_BLK(%o1, %f32)) + add %o1, 0x40, %o1 + + /* There are 8 instances of the unrolled loop, + * one for each possible alignment of the + * source buffer. Each loop instance is 452 + * bytes. 
+ */ + sll %g2, 3, %o3 + sub %o3, %g2, %o3 + sllx %o3, 4, %o3 + add %o3, %g2, %o3 + sllx %o3, 2, %g2 +1: rd %pc, %o3 + add %o3, %lo(1f - 1b), %o3 + jmpl %o3 + %g2, %g0 + nop + + .align 64 +1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f0, %f2, %f48 +1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) + STORE_JUMP(o0, f48, 40f) membar #Sync +2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) + STORE_JUMP(o0, f48, 48f) membar #Sync +3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) + STORE_JUMP(o0, f48, 56f) membar #Sync + +1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f2, %f4, %f48 +1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) + STORE_JUMP(o0, f48, 41f) membar #Sync +2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) + STORE_JUMP(o0, f48, 49f) membar #Sync +3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) + STORE_JUMP(o0, f48, 57f) membar #Sync + +1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f4, %f6, %f48 +1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) + STORE_JUMP(o0, f48, 42f) membar #Sync +2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) + STORE_JUMP(o0, f48, 50f) membar #Sync +3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) + STORE_JUMP(o0, f48, 58f) membar #Sync + +1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f6, %f8, %f48 +1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) + STORE_JUMP(o0, f48, 43f) membar #Sync +2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) + STORE_JUMP(o0, f48, 51f) membar #Sync +3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) + STORE_JUMP(o0, f48, 59f) membar #Sync + +1: 
FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f8, %f10, %f48 +1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) + STORE_JUMP(o0, f48, 44f) membar #Sync +2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) + STORE_JUMP(o0, f48, 52f) membar #Sync +3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) + STORE_JUMP(o0, f48, 60f) membar #Sync + +1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f10, %f12, %f48 +1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) + STORE_JUMP(o0, f48, 45f) membar #Sync +2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) + STORE_JUMP(o0, f48, 53f) membar #Sync +3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) + STORE_JUMP(o0, f48, 61f) membar #Sync + +1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f12, %f14, %f48 +1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) + STORE_JUMP(o0, f48, 46f) membar #Sync +2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) + STORE_JUMP(o0, f48, 54f) membar #Sync +3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) + STORE_JUMP(o0, f48, 62f) membar #Sync + +1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f14, %f16, %f48 +1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) + STORE_JUMP(o0, f48, 47f) membar #Sync +2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) + STORE_JUMP(o0, f48, 55f) membar #Sync +3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) + STORE_SYNC(o0, f48) membar #Sync + FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) + STORE_JUMP(o0, f48, 63f) membar #Sync + +40: FINISH_VISCHUNK(o0, f0, f2, g3) +41: FINISH_VISCHUNK(o0, f2, f4, g3) +42: FINISH_VISCHUNK(o0, f4, f6, g3) +43: FINISH_VISCHUNK(o0, f6, f8, g3) +44: FINISH_VISCHUNK(o0, f8, f10, 
g3) +45: FINISH_VISCHUNK(o0, f10, f12, g3) +46: FINISH_VISCHUNK(o0, f12, f14, g3) +47: UNEVEN_VISCHUNK(o0, f14, f0, g3) +48: FINISH_VISCHUNK(o0, f16, f18, g3) +49: FINISH_VISCHUNK(o0, f18, f20, g3) +50: FINISH_VISCHUNK(o0, f20, f22, g3) +51: FINISH_VISCHUNK(o0, f22, f24, g3) +52: FINISH_VISCHUNK(o0, f24, f26, g3) +53: FINISH_VISCHUNK(o0, f26, f28, g3) +54: FINISH_VISCHUNK(o0, f28, f30, g3) +55: UNEVEN_VISCHUNK(o0, f30, f0, g3) +56: FINISH_VISCHUNK(o0, f32, f34, g3) +57: FINISH_VISCHUNK(o0, f34, f36, g3) +58: FINISH_VISCHUNK(o0, f36, f38, g3) +59: FINISH_VISCHUNK(o0, f38, f40, g3) +60: FINISH_VISCHUNK(o0, f40, f42, g3) +61: FINISH_VISCHUNK(o0, f42, f44, g3) +62: FINISH_VISCHUNK(o0, f44, f46, g3) +63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3) + +93: EX_LD(LOAD(ldd, %o1, %f2)) + add %o1, 8, %o1 + subcc %g3, 8, %g3 + faligndata %f0, %f2, %f8 + EX_ST(STORE(std, %f8, %o0)) + bl,pn %xcc, 95f + add %o0, 8, %o0 + EX_LD(LOAD(ldd, %o1, %f0)) + add %o1, 8, %o1 + subcc %g3, 8, %g3 + faligndata %f2, %f0, %f8 + EX_ST(STORE(std, %f8, %o0)) + bge,pt %xcc, 93b + add %o0, 8, %o0 + +95: brz,pt %o2, 2f + mov %g1, %o1 + +1: EX_LD(LOAD(ldub, %o1, %o3)) + add %o1, 1, %o1 + subcc %o2, 1, %o2 + EX_ST(STORE(stb, %o3, %o0)) + bne,pt %xcc, 1b + add %o0, 1, %o0 + +2: membar #StoreLoad | #StoreStore + VISExit + retl + mov EX_RETVAL(%o4), %o0 + + .align 64 +70: /* 16 < len <= (5 * 64) */ + bne,pn %XCC, 75f + sub %o0, %o1, %o3 + +72: andn %o2, 0xf, %GLOBAL_SPARE + and %o2, 0xf, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) + subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 + EX_ST(STORE(stx, %g1, %o1 + %o3)) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + EX_LD(LOAD(ldx, %o1, %o5)) + sub %o2, 0x8, %o2 + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + EX_LD(LOAD(lduw, %o1, %o5)) + sub %o2, 0x4, %o2 + EX_ST(STORE(stw, %o5, %o1 + %o3)) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +75: andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1, %o5)) + subcc %g1, 1, %g1 + EX_ST(STORE(stb, %o5, %o1 + %o3)) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, %o3, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %xcc, 73b + +8: mov 64, %o3 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1, %g2)) + sub %o3, %g1, %o3 + andn %o2, 0x7, %GLOBAL_SPARE + sllx %g2, %g1, %g2 +1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) + subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE + add %o1, 0x8, %o1 + srlx %g3, %o3, %o5 + or %o5, %g2, %o5 + EX_ST(STORE(stx, %o5, %o0)) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %xcc, 90f + sub %o0, %o1, %o3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 |
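For copies whose source and destination disagree in their low three address bits, the non-VIS tail of U1memcpy.S (the `8:` block above) keeps the previous aligned 64-bit source word around and merges it with the next one using shifts and an `or`, so every destination store is still an aligned `stx`. A rough C illustration of that shift-and-merge idea follows; it assumes a big-endian layout like sparc64, `shift_merge_copy` and its parameters are illustrative names rather than kernel code, and `shift` is taken to be a multiple of 8 in the range 8..56 (the fully aligned case is handled by a separate path, as in the assembly).

```c
#include <stdint.h>
#include <stddef.h>

/*
 * Produce 'nwords' aligned 64-bit destination words from a source that
 * starts 'shift' bits into the first aligned source word. Reads
 * nwords + 1 source words, mirroring the read-ahead in the 8: block
 * (sllx/srlx/or on %g2/%g3/%o5). Big-endian byte order assumed.
 */
void shift_merge_copy(uint64_t *dst, const uint64_t *src,
		      unsigned int shift, size_t nwords)
{
	uint64_t cur = src[0] << shift;	/* already-consumed bytes shifted out */
	size_t i;

	for (i = 0; i < nwords; i++) {
		uint64_t next = src[i + 1];

		/* High bytes come from the previous word, low bytes
		 * from the next one; the store stays 8-byte aligned. */
		dst[i] = cur | (next >> (64 - shift));
		cur = next << shift;
	}
}
```

The same trade-off drives the VIS block-copy loops earlier in the file: `faligndata` performs this merge in the floating-point registers so the 64-byte block loads and stores can both stay aligned.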