Diffstat (limited to 'arch/sparc/lib')
82 files changed, 7057 insertions, 3395 deletions
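Editor's note: several of the files added below (GENpatch.S, GENpage.S, GENbzero.S, NGbzero.S, NGpage.S, NG2patch.S, NG4patch.S) repeat the same BRANCH_ALWAYS/NOP patching macro. At boot it overwrites the first two instructions of the old routine with an unconditional branch to the CPU-specific replacement plus a nop in the delay slot, then flushes the patched words from the instruction cache. The following C sketch only illustrates the displacement arithmetic those macros perform (delta shifted to a 19-bit word offset and OR'd into the branch opcode template); the helper name and the addresses are hypothetical and not part of the patch itself.

#include <stdint.h>
#include <stdio.h>

#define BRANCH_ALWAYS 0x10680000u	/* template for "ba,pt %xcc, <disp19>" */
#define NOP           0x01000000u

/* Build the two instruction words the patch macro stores at old_fn. */
static void patch_words(uint64_t old_fn, uint64_t new_fn, uint32_t out[2])
{
	int64_t delta = (int64_t)(new_fn - old_fn);	 /* like "sub %g1, %g2, %g1" */
	uint32_t disp19 = ((uint32_t)delta << 11) >> 13; /* "sll 11; srl 11+2": keep bits 20..2 */

	out[0] = BRANCH_ALWAYS | disp19;	/* branch from old_fn to new_fn */
	out[1] = NOP;				/* delay slot */
}

int main(void)
{
	uint32_t insn[2];

	/* Addresses are made up purely for illustration. */
	patch_words(0x100000, 0x104000, insn);
	printf("branch = %08x, nop = %08x\n", insn[0], insn[1]);
	return 0;
}

Because the displacement field is only 19 bits of word offset, this scheme assumes the old and new routines sit within roughly +/-1 MB of each other, which holds for routines linked into the same kernel image.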
diff --git a/arch/sparc/lib/GENbzero.S b/arch/sparc/lib/GENbzero.S new file mode 100644 index 00000000000..8e7a843ddd8 --- /dev/null +++ b/arch/sparc/lib/GENbzero.S @@ -0,0 +1,156 @@ +/* GENbzero.S: Generic sparc64 memset/clear_user. + * + * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + */ +#include <asm/asi.h> + +#define EX_ST(x,y) \ +98: x,y; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_o1; \ + .text; \ + .align 4; + + .align 32 + .text + + .globl GENmemset + .type GENmemset, #function +GENmemset: /* %o0=buf, %o1=pat, %o2=len */ + and %o1, 0xff, %o3 + mov %o2, %o1 + sllx %o3, 8, %g1 + or %g1, %o3, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %xcc, 1f + or %g1, %o2, %o2 + + .globl GENbzero + .type GENbzero, #function +GENbzero: + clr %o2 +1: brz,pn %o1, GENbzero_return + mov %o0, %o3 + + /* %o5: saved %asi, restored at GENbzero_done + * %o4: store %asi to use + */ + rd %asi, %o5 + mov ASI_P, %o4 + wr %o4, 0x0, %asi + +GENbzero_from_clear_user: + cmp %o1, 15 + bl,pn %icc, GENbzero_tiny + andcc %o0, 0x7, %g1 + be,pt %xcc, 2f + mov 8, %g2 + sub %g2, %g1, %g1 + sub %o1, %g1, %o1 +1: EX_ST(stba %o2, [%o0 + 0x00] %asi) + subcc %g1, 1, %g1 + bne,pt %xcc, 1b + add %o0, 1, %o0 +2: cmp %o1, 128 + bl,pn %icc, GENbzero_medium + andcc %o0, (64 - 1), %g1 + be,pt %xcc, GENbzero_pre_loop + mov 64, %g2 + sub %g2, %g1, %g1 + sub %o1, %g1, %o1 +1: EX_ST(stxa %o2, [%o0 + 0x00] %asi) + subcc %g1, 8, %g1 + bne,pt %xcc, 1b + add %o0, 8, %o0 + +GENbzero_pre_loop: + andn %o1, (64 - 1), %g1 + sub %o1, %g1, %o1 +GENbzero_loop: + EX_ST(stxa %o2, [%o0 + 0x00] %asi) + EX_ST(stxa %o2, [%o0 + 0x08] %asi) + EX_ST(stxa %o2, [%o0 + 0x10] %asi) + EX_ST(stxa %o2, [%o0 + 0x18] %asi) + EX_ST(stxa %o2, [%o0 + 0x20] %asi) + EX_ST(stxa %o2, [%o0 + 0x28] %asi) + EX_ST(stxa %o2, [%o0 + 0x30] %asi) + EX_ST(stxa %o2, [%o0 + 0x38] %asi) + subcc %g1, 64, %g1 + bne,pt %xcc, GENbzero_loop + add %o0, 64, %o0 + + membar #Sync + wr %o4, 0x0, %asi + brz,pn %o1, GENbzero_done +GENbzero_medium: + andncc %o1, 0x7, %g1 + be,pn %xcc, 2f + sub %o1, %g1, %o1 +1: EX_ST(stxa %o2, [%o0 + 0x00] %asi) + subcc %g1, 8, %g1 + bne,pt %xcc, 1b + add %o0, 8, %o0 +2: brz,pt %o1, GENbzero_done + nop + +GENbzero_tiny: +1: EX_ST(stba %o2, [%o0 + 0x00] %asi) + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 1, %o0 + + /* fallthrough */ + +GENbzero_done: + wr %o5, 0x0, %asi + +GENbzero_return: + retl + mov %o3, %o0 + .size GENbzero, .-GENbzero + .size GENmemset, .-GENmemset + + .globl GENclear_user + .type GENclear_user, #function +GENclear_user: /* %o0=buf, %o1=len */ + rd %asi, %o5 + brz,pn %o1, GENbzero_done + clr %o3 + cmp %o5, ASI_AIUS + bne,pn %icc, GENbzero + clr %o2 + ba,pt %xcc, GENbzero_from_clear_user + mov ASI_AIUS, %o4 + .size GENclear_user, .-GENclear_user + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define GEN_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl generic_patch_bzero + .type generic_patch_bzero,#function +generic_patch_bzero: + GEN_DO_PATCH(memset, GENmemset) + GEN_DO_PATCH(__bzero, GENbzero) + GEN_DO_PATCH(__clear_user, GENclear_user) + retl + nop + .size generic_patch_bzero,.-generic_patch_bzero diff --git 
a/arch/sparc/lib/GENcopy_from_user.S b/arch/sparc/lib/GENcopy_from_user.S new file mode 100644 index 00000000000..b7d0bd6b140 --- /dev/null +++ b/arch/sparc/lib/GENcopy_from_user.S @@ -0,0 +1,30 @@ +/* GENcopy_from_user.S: Generic sparc64 copy from userspace. + * + * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME GENcopy_from_user +#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "GENmemcpy.S" diff --git a/arch/sparc/lib/GENcopy_to_user.S b/arch/sparc/lib/GENcopy_to_user.S new file mode 100644 index 00000000000..780550e1afc --- /dev/null +++ b/arch/sparc/lib/GENcopy_to_user.S @@ -0,0 +1,34 @@ +/* GENcopy_to_user.S: Generic sparc64 copy to userspace. + * + * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME GENcopy_to_user +#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "GENmemcpy.S" diff --git a/arch/sparc/lib/GENmemcpy.S b/arch/sparc/lib/GENmemcpy.S new file mode 100644 index 00000000000..89358ee9485 --- /dev/null +++ b/arch/sparc/lib/GENmemcpy.S @@ -0,0 +1,121 @@ +/* GENmemcpy.S: Generic sparc64 memcpy. + * + * Copyright (C) 2007 David S. 
Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#define GLOBAL_SPARE %g7 +#else +#define GLOBAL_SPARE %g5 +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#define STORE(type,src,addr) type src, [addr] +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME GENmemcpy +#endif + +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %XCC, 5 + PREAMBLE + mov %o0, GLOBAL_SPARE + + cmp %o2, 0 + be,pn %XCC, 85f + or %o0, %o1, %o3 + cmp %o2, 16 + blu,a,pn %XCC, 80f + or %o3, %o2, %o3 + + xor %o0, %o1, %o4 + andcc %o4, 0x7, %g0 + bne,a,pn %XCC, 90f + sub %o0, %o1, %o3 + + and %o0, 0x7, %o4 + sub %o4, 0x8, %o4 + sub %g0, %o4, %o4 + sub %o2, %o4, %o2 +1: subcc %o4, 1, %o4 + EX_LD(LOAD(ldub, %o1, %g1)) + EX_ST(STORE(stb, %g1, %o0)) + add %o1, 1, %o1 + bne,pt %XCC, 1b + add %o0, 1, %o0 + + andn %o2, 0x7, %g1 + sub %o2, %g1, %o2 +1: subcc %g1, 0x8, %g1 + EX_LD(LOAD(ldx, %o1, %g2)) + EX_ST(STORE(stx, %g2, %o0)) + add %o1, 0x8, %o1 + bne,pt %XCC, 1b + add %o0, 0x8, %o0 + + brz,pt %o2, 85f + sub %o0, %o1, %o3 + ba,a,pt %XCC, 90f + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, %o3 + +1: + subcc %o2, 4, %o2 + EX_LD(LOAD(lduw, %o1, %g1)) + EX_ST(STORE(stw, %g1, %o1 + %o3)) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov EX_RETVAL(GLOBAL_SPARE), %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + EX_LD(LOAD(ldub, %o1, %g1)) + EX_ST(STORE(stb, %g1, %o1 + %o3)) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov EX_RETVAL(GLOBAL_SPARE), %o0 + + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/GENpage.S b/arch/sparc/lib/GENpage.S new file mode 100644 index 00000000000..2ef9d05f21b --- /dev/null +++ b/arch/sparc/lib/GENpage.S @@ -0,0 +1,77 @@ +/* GENpage.S: Generic clear and copy page. 
+ * + * Copyright (C) 2007 (davem@davemloft.net) + */ +#include <asm/page.h> + + .text + .align 32 + +GENcopy_user_page: + set PAGE_SIZE, %g7 +1: ldx [%o1 + 0x00], %o2 + ldx [%o1 + 0x08], %o3 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + stx %o2, [%o0 + 0x00] + stx %o3, [%o0 + 0x08] + stx %o4, [%o0 + 0x10] + stx %o5, [%o0 + 0x18] + ldx [%o1 + 0x20], %o2 + ldx [%o1 + 0x28], %o3 + ldx [%o1 + 0x30], %o4 + ldx [%o1 + 0x38], %o5 + stx %o2, [%o0 + 0x20] + stx %o3, [%o0 + 0x28] + stx %o4, [%o0 + 0x30] + stx %o5, [%o0 + 0x38] + subcc %g7, 64, %g7 + add %o1, 64, %o1 + bne,pt %xcc, 1b + add %o0, 64, %o0 + retl + nop + +GENclear_page: +GENclear_user_page: + set PAGE_SIZE, %g7 +1: stx %g0, [%o0 + 0x00] + stx %g0, [%o0 + 0x08] + stx %g0, [%o0 + 0x10] + stx %g0, [%o0 + 0x18] + stx %g0, [%o0 + 0x20] + stx %g0, [%o0 + 0x28] + stx %g0, [%o0 + 0x30] + stx %g0, [%o0 + 0x38] + subcc %g7, 64, %g7 + bne,pt %xcc, 1b + add %o0, 64, %o0 + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define GEN_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl generic_patch_pageops + .type generic_patch_pageops,#function +generic_patch_pageops: + GEN_DO_PATCH(copy_user_page, GENcopy_user_page) + GEN_DO_PATCH(_clear_page, GENclear_page) + GEN_DO_PATCH(clear_user_page, GENclear_user_page) + retl + nop + .size generic_patch_pageops,.-generic_patch_pageops diff --git a/arch/sparc/lib/GENpatch.S b/arch/sparc/lib/GENpatch.S new file mode 100644 index 00000000000..fab9e89f16b --- /dev/null +++ b/arch/sparc/lib/GENpatch.S @@ -0,0 +1,33 @@ +/* GENpatch.S: Patch Ultra-I routines with generic variant. + * + * Copyright (C) 2007 David S. Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define GEN_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl generic_patch_copyops + .type generic_patch_copyops,#function +generic_patch_copyops: + GEN_DO_PATCH(memcpy, GENmemcpy) + GEN_DO_PATCH(___copy_from_user, GENcopy_from_user) + GEN_DO_PATCH(___copy_to_user, GENcopy_to_user) + retl + nop + .size generic_patch_copyops,.-generic_patch_copyops diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 76effdbea07..3269b023409 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -1,14 +1,47 @@ -# $Id: Makefile,v 1.35 2000/12/15 00:41:18 davem Exp $ # Makefile for Sparc library files.. 
# -EXTRA_AFLAGS := -ansi -DST_DIV0=0x02 +asflags-y := -ansi -DST_DIV0=0x02 +ccflags-y := -Werror -lib-y := mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \ - strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \ - strncpy_from_user.o divdi3.o udivdi3.o strlen_user.o \ - copy_user.o locks.o atomic.o \ - lshrdi3.o ashldi3.o rwsem.o muldi3.o bitext.o \ - cmpdi2.o +lib-$(CONFIG_SPARC32) += ashrdi3.o +lib-$(CONFIG_SPARC32) += memcpy.o memset.o +lib-y += strlen.o +lib-y += checksum_$(BITS).o +lib-$(CONFIG_SPARC32) += blockops.o +lib-y += memscan_$(BITS).o memcmp.o strncmp_$(BITS).o +lib-$(CONFIG_SPARC32) += divdi3.o udivdi3.o +lib-$(CONFIG_SPARC32) += copy_user.o locks.o +lib-$(CONFIG_SPARC64) += atomic_64.o +lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o +lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o -obj-y += iomap.o atomic32.o +lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o +lib-$(CONFIG_SPARC64) += csum_copy.o csum_copy_from_user.o csum_copy_to_user.o +lib-$(CONFIG_SPARC64) += VISsave.o +lib-$(CONFIG_SPARC64) += bitops.o + +lib-$(CONFIG_SPARC64) += U1memcpy.o U1copy_from_user.o U1copy_to_user.o + +lib-$(CONFIG_SPARC64) += U3memcpy.o U3copy_from_user.o U3copy_to_user.o +lib-$(CONFIG_SPARC64) += U3patch.o + +lib-$(CONFIG_SPARC64) += NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o +lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o + +lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o +lib-$(CONFIG_SPARC64) += NG2patch.o + +lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o +lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o NG4clear_page.o NG4memset.o + +lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o +lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o + +lib-$(CONFIG_SPARC64) += copy_in_user.o user_fixup.o memmove.o +lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o + +obj-$(CONFIG_SPARC64) += iomap.o +obj-$(CONFIG_SPARC32) += atomic32.o ucmpdi2.o +obj-y += ksyms.o +obj-$(CONFIG_SPARC64) += PeeCeeI.o diff --git a/arch/sparc/lib/NG2copy_from_user.S b/arch/sparc/lib/NG2copy_from_user.S new file mode 100644 index 00000000000..119ccb9a54f --- /dev/null +++ b/arch/sparc/lib/NG2copy_from_user.S @@ -0,0 +1,35 @@ +/* NG2copy_from_user.S: Niagara-2 optimized copy from userspace. + * + * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#ifndef ASI_BLK_AIUS_4V +#define ASI_BLK_AIUS_4V 0x17 +#endif + +#define FUNC_NAME NG2copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest +#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_AIUS_4V, dest +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG2memcpy.S" diff --git a/arch/sparc/lib/NG2copy_to_user.S b/arch/sparc/lib/NG2copy_to_user.S new file mode 100644 index 00000000000..7fe1ccefd9d --- /dev/null +++ b/arch/sparc/lib/NG2copy_to_user.S @@ -0,0 +1,44 @@ +/* NG2copy_to_user.S: Niagara-2 optimized copy to userspace. + * + * Copyright (C) 2007 David S. 
Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#ifndef ASI_BLK_AIUS_4V +#define ASI_BLK_AIUS_4V 0x17 +#endif + +#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS +#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 +#endif + +#define FUNC_NAME NG2copy_to_user +#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS_4V +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG2memcpy.S" diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S new file mode 100644 index 00000000000..30eee6e8a81 --- /dev/null +++ b/arch/sparc/lib/NG2memcpy.S @@ -0,0 +1,521 @@ +/* NG2memcpy.S: Niagara-2 optimized memcpy. + * + * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE %g7 +#else +#define ASI_PNF 0x82 +#define ASI_BLK_P 0xf0 +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define FPRS_FEF 0x04 +#ifdef MEMCPY_DEBUG +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif +#define GLOBAL_SPARE %g5 +#endif + +#ifndef STORE_ASI +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P +#else +#define STORE_ASI 0x80 /* ASI_P */ +#endif +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef LOAD_BLK +#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest +#endif + +#ifndef STORE +#ifndef MEMCPY_DEBUG +#define STORE(type,src,addr) type src, [addr] +#else +#define STORE(type,src,addr) type##a src, [addr] 0x80 +#endif +#endif + +#ifndef STORE_BLK +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P +#endif + +#ifndef STORE_INIT +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME NG2memcpy +#endif + +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + +#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \ + faligndata %x0, %x1, %f0; \ + faligndata %x1, %x2, %f2; \ + faligndata %x2, %x3, %f4; \ + faligndata %x3, %x4, %f6; \ + faligndata %x4, %x5, %f8; \ + faligndata %x5, %x6, %f10; \ + faligndata %x6, %x7, %f12; \ + faligndata %x7, %x8, %f14; + +#define FREG_MOVE_1(x0) \ + fsrc2 %x0, %f0; +#define FREG_MOVE_2(x0, x1) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; +#define FREG_MOVE_3(x0, x1, x2) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; +#define FREG_MOVE_4(x0, x1, x2, x3) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; +#define FREG_MOVE_5(x0, x1, x2, x3, x4) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; +#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \ + fsrc2 %x0, %f0; \ + fsrc2 
%x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; \ + fsrc2 %x5, %f10; +#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; \ + fsrc2 %x5, %f10; \ + fsrc2 %x6, %f12; +#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \ + fsrc2 %x0, %f0; \ + fsrc2 %x1, %f2; \ + fsrc2 %x2, %f4; \ + fsrc2 %x3, %f6; \ + fsrc2 %x4, %f8; \ + fsrc2 %x5, %f10; \ + fsrc2 %x6, %f12; \ + fsrc2 %x7, %f14; +#define FREG_LOAD_1(base, x0) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)) +#define FREG_LOAD_2(base, x0, x1) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)); \ + EX_LD(LOAD(ldd, base + 0x08, %x1)); +#define FREG_LOAD_3(base, x0, x1, x2) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)); \ + EX_LD(LOAD(ldd, base + 0x08, %x1)); \ + EX_LD(LOAD(ldd, base + 0x10, %x2)); +#define FREG_LOAD_4(base, x0, x1, x2, x3) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)); \ + EX_LD(LOAD(ldd, base + 0x08, %x1)); \ + EX_LD(LOAD(ldd, base + 0x10, %x2)); \ + EX_LD(LOAD(ldd, base + 0x18, %x3)); +#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)); \ + EX_LD(LOAD(ldd, base + 0x08, %x1)); \ + EX_LD(LOAD(ldd, base + 0x10, %x2)); \ + EX_LD(LOAD(ldd, base + 0x18, %x3)); \ + EX_LD(LOAD(ldd, base + 0x20, %x4)); +#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)); \ + EX_LD(LOAD(ldd, base + 0x08, %x1)); \ + EX_LD(LOAD(ldd, base + 0x10, %x2)); \ + EX_LD(LOAD(ldd, base + 0x18, %x3)); \ + EX_LD(LOAD(ldd, base + 0x20, %x4)); \ + EX_LD(LOAD(ldd, base + 0x28, %x5)); +#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \ + EX_LD(LOAD(ldd, base + 0x00, %x0)); \ + EX_LD(LOAD(ldd, base + 0x08, %x1)); \ + EX_LD(LOAD(ldd, base + 0x10, %x2)); \ + EX_LD(LOAD(ldd, base + 0x18, %x3)); \ + EX_LD(LOAD(ldd, base + 0x20, %x4)); \ + EX_LD(LOAD(ldd, base + 0x28, %x5)); \ + EX_LD(LOAD(ldd, base + 0x30, %x6)); + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %xcc, 5 + PREAMBLE + mov %o0, %o3 + cmp %o2, 0 + be,pn %XCC, 85f + or %o0, %o1, GLOBAL_SPARE + cmp %o2, 16 + blu,a,pn %XCC, 80f + or GLOBAL_SPARE, %o2, GLOBAL_SPARE + + /* 2 blocks (128 bytes) is the minimum we can do the block + * copy with. We need to ensure that we'll iterate at least + * once in the block copy loop. At worst we'll need to align + * the destination to a 64-byte boundary which can chew up + * to (64 - 1) bytes from the length before we perform the + * block copy loop. + * + * However, the cut-off point, performance wise, is around + * 4 64-byte blocks. + */ + cmp %o2, (4 * 64) + blu,pt %XCC, 75f + andcc GLOBAL_SPARE, 0x7, %g0 + + /* %o0: dst + * %o1: src + * %o2: len (known to be >= 128) + * + * The block copy loops can use %o4, %g2, %g3 as + * temporaries while copying the data. %o5 must + * be preserved between VISEntryHalf and VISExitHalf + */ + + LOAD(prefetch, %o1 + 0x000, #one_read) + LOAD(prefetch, %o1 + 0x040, #one_read) + LOAD(prefetch, %o1 + 0x080, #one_read) + + /* Align destination on 64-byte boundary. */ + andcc %o0, (64 - 1), %o4 + be,pt %XCC, 2f + sub %o4, 64, %o4 + sub %g0, %o4, %o4 ! bytes to align dst + sub %o2, %o4, %o2 +1: subcc %o4, 1, %o4 + EX_LD(LOAD(ldub, %o1, %g1)) + EX_ST(STORE(stb, %g1, %o0)) + add %o1, 1, %o1 + bne,pt %XCC, 1b + add %o0, 1, %o0 + +2: + /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve + * o5 from here until we hit VISExitHalf. 
+ */ + VISEntryHalf + + membar #Sync + alignaddr %o1, %g0, %g0 + + add %o1, (64 - 1), %o4 + andn %o4, (64 - 1), %o4 + andn %o2, (64 - 1), %g1 + sub %o2, %g1, %o2 + + and %o1, (64 - 1), %g2 + add %o1, %g1, %o1 + sub %o0, %o4, %g3 + brz,pt %g2, 190f + cmp %g2, 32 + blu,a 5f + cmp %g2, 16 + cmp %g2, 48 + blu,a 4f + cmp %g2, 40 + cmp %g2, 56 + blu 170f + nop + ba,a,pt %xcc, 180f + +4: /* 32 <= low bits < 48 */ + blu 150f + nop + ba,a,pt %xcc, 160f +5: /* 0 < low bits < 32 */ + blu,a 6f + cmp %g2, 8 + cmp %g2, 24 + blu 130f + nop + ba,a,pt %xcc, 140f +6: /* 0 < low bits < 16 */ + bgeu 120f + nop + /* fall through for 0 < low bits < 8 */ +110: sub %o4, 64, %g2 + EX_LD(LOAD_BLK(%g2, %f0)) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +120: sub %o4, 56, %g2 + FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +130: sub %o4, 48, %g2 + FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_6(f20, f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +140: sub %o4, 40, %g2 + FREG_LOAD_5(%g2, f0, f2, f4, f6, f8) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_5(f22, f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +150: sub %o4, 32, %g2 + FREG_LOAD_4(%g2, f0, f2, f4, f6) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_4(f24, f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +160: sub %o4, 24, %g2 + FREG_LOAD_3(%g2, f0, f2, f4) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_3(f26, f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +170: sub %o4, 16, %g2 + FREG_LOAD_2(%g2, f0, f2) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_2(f28, f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +180: sub %o4, 8, %g2 + FREG_LOAD_1(%g2, f0) +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + EX_LD(LOAD_BLK(%o4, %f16)) + FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + FREG_MOVE_1(f30) + subcc %g1, 64, %g1 + add %o4, 64, %o4 + bne,pt %xcc, 1b + 
LOAD(prefetch, %o4 + 64, #one_read) + ba,pt %xcc, 195f + nop + +190: +1: EX_ST(STORE_INIT(%g0, %o4 + %g3)) + subcc %g1, 64, %g1 + EX_LD(LOAD_BLK(%o4, %f0)) + EX_ST(STORE_BLK(%f0, %o4 + %g3)) + add %o4, 64, %o4 + bne,pt %xcc, 1b + LOAD(prefetch, %o4 + 64, #one_read) + +195: + add %o4, %g3, %o0 + membar #Sync + + VISExitHalf + + /* %o2 contains any final bytes still needed to be copied + * over. If anything is left, we copy it one byte at a time. + */ + brz,pt %o2, 85f + sub %o0, %o1, GLOBAL_SPARE + ba,a,pt %XCC, 90f + + .align 64 +75: /* 16 < len <= 64 */ + bne,pn %XCC, 75f + sub %o0, %o1, GLOBAL_SPARE + +72: + andn %o2, 0xf, %o4 + and %o2, 0xf, %o2 +1: subcc %o4, 0x10, %o4 + EX_LD(LOAD(ldx, %o1, %o5)) + add %o1, 0x08, %o1 + EX_LD(LOAD(ldx, %o1, %g1)) + sub %o1, 0x08, %o1 + EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE)) + add %o1, 0x8, %o1 + EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE)) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x8, %o2 + EX_LD(LOAD(ldx, %o1, %o5)) + EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE)) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + EX_LD(LOAD(lduw, %o1, %o5)) + EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE)) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +75: + andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: subcc %g1, 1, %g1 + EX_LD(LOAD(ldub, %o1, %o5)) + EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE)) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, GLOBAL_SPARE, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %xcc, 73b + +8: mov 64, GLOBAL_SPARE + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1, %g2)) + sub GLOBAL_SPARE, %g1, GLOBAL_SPARE + andn %o2, 0x7, %o4 + sllx %g2, %g1, %g2 +1: add %o1, 0x8, %o1 + EX_LD(LOAD(ldx, %o1, %g3)) + subcc %o4, 0x8, %o4 + srlx %g3, GLOBAL_SPARE, %o5 + or %o5, %g2, %o5 + EX_ST(STORE(stx, %o5, %o0)) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %xcc, 90f + sub %o0, %o1, GLOBAL_SPARE + + .align 64 +80: /* 0 < len <= 16 */ + andcc GLOBAL_SPARE, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, GLOBAL_SPARE + +1: + subcc %o2, 4, %o2 + EX_LD(LOAD(lduw, %o1, %g1)) + EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE)) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov EX_RETVAL(%o3), %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + EX_LD(LOAD(ldub, %o1, %g1)) + EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE)) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov EX_RETVAL(%o3), %o0 + + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG2patch.S b/arch/sparc/lib/NG2patch.S new file mode 100644 index 00000000000..28c36f06a6d --- /dev/null +++ b/arch/sparc/lib/NG2patch.S @@ -0,0 +1,33 @@ +/* NG2patch.S: Patch Ultra-I routines with Niagara-2 variant. + * + * Copyright (C) 2007 David S. 
Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara2_patch_copyops + .type niagara2_patch_copyops,#function +niagara2_patch_copyops: + NG_DO_PATCH(memcpy, NG2memcpy) + NG_DO_PATCH(___copy_from_user, NG2copy_from_user) + NG_DO_PATCH(___copy_to_user, NG2copy_to_user) + retl + nop + .size niagara2_patch_copyops,.-niagara2_patch_copyops diff --git a/arch/sparc/lib/NG4clear_page.S b/arch/sparc/lib/NG4clear_page.S new file mode 100644 index 00000000000..e16c88204a4 --- /dev/null +++ b/arch/sparc/lib/NG4clear_page.S @@ -0,0 +1,29 @@ +/* NG4copy_page.S: Niagara-4 optimized clear page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + + .register %g3, #scratch + + .align 32 + .globl NG4clear_page + .globl NG4clear_user_page +NG4clear_page: /* %o0=dest */ +NG4clear_user_page: /* %o0=dest, %o1=vaddr */ + set PAGE_SIZE, %g7 + mov 0x20, %g3 +1: stxa %g0, [%o0 + %g0] ASI_ST_BLKINIT_MRU_P + subcc %g7, 0x40, %g7 + stxa %g0, [%o0 + %g3] ASI_ST_BLKINIT_MRU_P + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + membar #StoreLoad|#StoreStore + retl + nop + .size NG4clear_page,.-NG4clear_page + .size NG4clear_user_page,.-NG4clear_user_page
\ No newline at end of file diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S new file mode 100644 index 00000000000..fd9f903ffa3 --- /dev/null +++ b/arch/sparc/lib/NG4copy_from_user.S @@ -0,0 +1,30 @@ +/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME NG4copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S new file mode 100644 index 00000000000..28504e88c53 --- /dev/null +++ b/arch/sparc/lib/NG4copy_page.S @@ -0,0 +1,57 @@ +/* NG4copy_page.S: Niagara-4 optimized copy page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + .align 32 + + .register %g2, #scratch + .register %g3, #scratch + + .globl NG4copy_user_page +NG4copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ + prefetch [%o1 + 0x000], #n_reads_strong + prefetch [%o1 + 0x040], #n_reads_strong + prefetch [%o1 + 0x080], #n_reads_strong + prefetch [%o1 + 0x0c0], #n_reads_strong + set PAGE_SIZE, %g7 + prefetch [%o1 + 0x100], #n_reads_strong + prefetch [%o1 + 0x140], #n_reads_strong + prefetch [%o1 + 0x180], #n_reads_strong + prefetch [%o1 + 0x1c0], #n_reads_strong +1: + ldx [%o1 + 0x00], %o2 + subcc %g7, 0x40, %g7 + ldx [%o1 + 0x08], %o3 + ldx [%o1 + 0x10], %o4 + ldx [%o1 + 0x18], %o5 + ldx [%o1 + 0x20], %g1 + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x28], %g2 + stxa %o3, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x30], %g3 + stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + ldx [%o1 + 0x38], %o2 + add %o1, 0x40, %o1 + stxa %o5, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g1, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %g3, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P + add %o0, 0x08, %o0 + bne,pt %icc, 1b + prefetch [%o1 + 0x200], #n_reads_strong + retl + membar #StoreLoad | #StoreStore + .size NG4copy_user_page,.-NG4copy_user_page diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S new file mode 100644 index 00000000000..9744c4540a8 --- /dev/null +++ b/arch/sparc/lib/NG4copy_to_user.S @@ -0,0 +1,39 @@ +/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS +#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 +#endif + +#define FUNC_NAME NG4copy_to_user +#define STORE(type,src,addr) type##a src, [addr] %asi +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS +#define EX_RETVAL(x) 0 + +#ifdef __KERNEL__ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. 
+ */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NG4memcpy.S" diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S new file mode 100644 index 00000000000..9cf2ee01cee --- /dev/null +++ b/arch/sparc/lib/NG4memcpy.S @@ -0,0 +1,360 @@ +/* NG4memcpy.S: Niagara-4 optimized memcpy. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE %g7 +#else +#define ASI_BLK_INIT_QUAD_LDD_P 0xe2 +#define FPRS_FEF 0x04 + +/* On T4 it is very expensive to access ASRs like %fprs and + * %asi, avoiding a read or a write can save ~50 cycles. + */ +#define FPU_ENTER \ + rd %fprs, %o5; \ + andcc %o5, FPRS_FEF, %g0; \ + be,a,pn %icc, 999f; \ + wr %g0, FPRS_FEF, %fprs; \ + 999: + +#ifdef MEMCPY_DEBUG +#define VISEntryHalf FPU_ENTER; \ + clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntryHalf FPU_ENTER +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif + +#define GLOBAL_SPARE %g5 +#endif + +#ifndef STORE_ASI +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P +#else +#define STORE_ASI 0x80 /* ASI_P */ +#endif +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#ifndef MEMCPY_DEBUG +#define STORE(type,src,addr) type src, [addr] +#else +#define STORE(type,src,addr) type##a src, [addr] %asi +#endif +#endif + +#ifndef STORE_INIT +#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME NG4memcpy +#endif +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ +#ifdef MEMCPY_DEBUG + wr %g0, 0x80, %asi +#endif + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %XCC, 5 + PREAMBLE + mov %o0, %o3 + brz,pn %o2, .Lexit + cmp %o2, 3 + ble,pn %icc, .Ltiny + cmp %o2, 19 + ble,pn %icc, .Lsmall + or %o0, %o1, %g2 + cmp %o2, 128 + bl,pn %icc, .Lmedium + nop + +.Llarge:/* len >= 0x80 */ + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 51f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) + +51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) + LOAD(prefetch, %o1 + 0x080, #n_reads_strong) + LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x100, #n_reads_strong) + LOAD(prefetch, %o1 + 0x140, #n_reads_strong) + LOAD(prefetch, %o1 + 0x180, #n_reads_strong) + LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + /* Check if we can use the straight fully aligned + * loop, or we require the alignaddr/faligndata variant. + */ + andcc %o1, 0x7, %o5 + bne,pn %icc, .Llarge_src_unaligned + sub %g0, %o0, %g1 + + /* Legitimize the use of initializing stores by getting dest + * to be 64-byte aligned. 
+ */ + and %g1, 0x3f, %g1 + brz,pt %g1, .Llarge_aligned + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) + add %o1, 8, %o1 + subcc %g1, 8, %g1 + add %o0, 8, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g2, %o0 - 0x08)) + +.Llarge_aligned: + /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x40, %o1 + EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) + EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) + EX_ST(STORE_INIT(%g1, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) + EX_ST(STORE_INIT(%o5, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g2, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(%g3, %o0)) + add %o0, 0x08, %o0 + EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %o1 + 0x200, #n_reads_strong) + + membar #StoreLoad | #StoreStore + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_noprefetch + +.Lexit: retl + mov EX_RETVAL(%o3), %o0 + +.Llarge_src_unaligned: + andn %o2, 0x3f, %o4 + sub %o2, %o4, %o2 + VISEntryHalf + alignaddr %o1, %g0, %g1 + add %o1, %o4, %o1 + EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) +1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) + subcc %o4, 0x40, %o4 + EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) + EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) + EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) + EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) + EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) + EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) + faligndata %f0, %f2, %f16 + EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) + faligndata %f2, %f4, %f18 + add %g1, 0x40, %g1 + faligndata %f4, %f6, %f20 + faligndata %f6, %f8, %f22 + faligndata %f8, %f10, %f24 + faligndata %f10, %f12, %f26 + faligndata %f12, %f14, %f28 + faligndata %f14, %f0, %f30 + EX_ST(STORE(std, %f16, %o0 + 0x00)) + EX_ST(STORE(std, %f18, %o0 + 0x08)) + EX_ST(STORE(std, %f20, %o0 + 0x10)) + EX_ST(STORE(std, %f22, %o0 + 0x18)) + EX_ST(STORE(std, %f24, %o0 + 0x20)) + EX_ST(STORE(std, %f26, %o0 + 0x28)) + EX_ST(STORE(std, %f28, %o0 + 0x30)) + EX_ST(STORE(std, %f30, %o0 + 0x38)) + add %o0, 0x40, %o0 + bne,pt %icc, 1b + LOAD(prefetch, %g1 + 0x200, #n_reads_strong) + VISExitHalf + + brz,pn %o2, .Lexit + cmp %o2, 19 + ble,pn %icc, .Lsmall_unaligned + nop + ba,a,pt %icc, .Lmedium_unaligned + +.Lmedium: + LOAD(prefetch, %o1 + 0x40, #n_reads_strong) + andcc %g2, 0x7, %g0 + bne,pn %icc, .Lmedium_unaligned + nop +.Lmedium_noprefetch: + andncc %o2, 0x20 - 1, %o5 + be,pn %icc, 2f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) + EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) + EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) + add %o1, 0x20, %o1 + subcc %o5, 0x20, %o5 + EX_ST(STORE(stx, %g1, %o0 + 0x00)) + EX_ST(STORE(stx, %g2, %o0 + 0x08)) + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) + EX_ST(STORE(stx, %o4, %o0 + 0x18)) + bne,pt %icc, 1b + add %o0, 0x20, %o0 +2: andcc %o2, 0x18, %o5 + be,pt %icc, 3f + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) + add %o1, 0x08, %o1 + add %o0, 0x08, %o0 + subcc %o5, 0x08, %o5 + bne,pt %icc, 1b + EX_ST(STORE(stx, %g1, %o0 - 0x08)) +3: brz,pt %o2, .Lexit + cmp %o2, 0x04 + bl,pn %icc, .Ltiny + nop + EX_LD(LOAD(lduw, %o1 + 
0x00, %g1)) + add %o1, 0x04, %o1 + add %o0, 0x04, %o0 + subcc %o2, 0x04, %o2 + bne,pn %icc, .Ltiny + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + ba,a,pt %icc, .Lexit +.Lmedium_unaligned: + /* First get dest 8 byte aligned. */ + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, 2f + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) + add %o1, 1, %o1 + subcc %g1, 1, %g1 + add %o0, 1, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g2, %o0 - 0x01)) +2: + and %o1, 0x7, %g1 + brz,pn %g1, .Lmedium_noprefetch + sll %g1, 3, %g1 + mov 64, %g2 + sub %g2, %g1, %g2 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) + sllx %o4, %g1, %o4 + andn %o2, 0x08 - 1, %o5 + sub %o2, %o5, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) + add %o1, 0x08, %o1 + subcc %o5, 0x08, %o5 + srlx %g3, %g2, GLOBAL_SPARE + or GLOBAL_SPARE, %o4, GLOBAL_SPARE + EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) + add %o0, 0x08, %o0 + bne,pt %icc, 1b + sllx %g3, %g1, %o4 + srl %g1, 3, %g1 + add %o1, %g1, %o1 + brz,pn %o2, .Lexit + nop + ba,pt %icc, .Lsmall_unaligned + +.Ltiny: + EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x00)) + EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) + subcc %o2, 1, %o2 + be,pn %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x01)) + EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) + ba,pt %icc, .Lexit + EX_ST(STORE(stb, %g1, %o0 + 0x02)) + +.Lsmall: + andcc %g2, 0x3, %g0 + bne,pn %icc, .Lsmall_unaligned + andn %o2, 0x4 - 1, %o5 + sub %o2, %o5, %o2 +1: + EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) + add %o1, 0x04, %o1 + subcc %o5, 0x04, %o5 + add %o0, 0x04, %o0 + bne,pt %icc, 1b + EX_ST(STORE(stw, %g1, %o0 - 0x04)) + brz,pt %o2, .Lexit + nop + ba,a,pt %icc, .Ltiny + +.Lsmall_unaligned: +1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) + add %o1, 1, %o1 + add %o0, 1, %o0 + subcc %o2, 1, %o2 + bne,pt %icc, 1b + EX_ST(STORE(stb, %g1, %o0 - 0x01)) + ba,a,pt %icc, .Lexit + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S new file mode 100644 index 00000000000..41da4bdd95c --- /dev/null +++ b/arch/sparc/lib/NG4memset.S @@ -0,0 +1,105 @@ +/* NG4memset.S: Niagara-4 optimized memset/bzero. + * + * Copyright (C) 2012 David S. 
Miller (davem@davemloft.net) + */ + +#include <asm/asi.h> + + .register %g2, #scratch + .register %g3, #scratch + + .text + .align 32 + .globl NG4memset +NG4memset: + andcc %o1, 0xff, %o4 + be,pt %icc, 1f + mov %o2, %o1 + sllx %o4, 8, %g1 + or %g1, %o4, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %icc, 1f + or %g1, %o2, %o4 + .size NG4memset,.-NG4memset + + .align 32 + .globl NG4bzero +NG4bzero: + clr %o4 +1: cmp %o1, 16 + ble %icc, .Ltiny + mov %o0, %o3 + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, .Laligned8 + sub %o1, %g1, %o1 +1: stb %o4, [%o0 + 0x00] + subcc %g1, 1, %g1 + bne,pt %icc, 1b + add %o0, 1, %o0 +.Laligned8: + cmp %o1, 64 + (64 - 8) + ble .Lmedium + sub %g0, %o0, %g1 + andcc %g1, (64 - 1), %g1 + brz,pn %g1, .Laligned64 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 1b + add %o0, 0x8, %o0 +.Laligned64: + andn %o1, 64 - 1, %g1 + sub %o1, %g1, %o1 + brnz,pn %o4, .Lnon_bzero_loop + mov 0x20, %g2 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x40, %o0 +.Lpostloop: + cmp %o1, 8 + bl,pn %icc, .Ltiny + membar #StoreStore|#StoreLoad +.Lmedium: + andn %o1, 0x7, %g1 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 0x8, %g1 + bne,pt %icc, 1b + add %o0, 0x08, %o0 + andcc %o1, 0x4, %g1 + be,pt %icc, .Ltiny + sub %o1, %g1, %o1 + stw %o4, [%o0 + 0x00] + add %o0, 0x4, %o0 +.Ltiny: + cmp %o1, 0 + be,pn %icc, .Lexit +1: subcc %o1, 1, %o1 + stb %o4, [%o0 + 0x00] + bne,pt %icc, 1b + add %o0, 1, %o0 +.Lexit: + retl + mov %o3, %o0 +.Lnon_bzero_loop: + mov 0x08, %g3 + mov 0x28, %o5 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + add %o0, 0x10, %o0 + stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x30, %o0 + ba,a,pt %icc, .Lpostloop + .size NG4bzero,.-NG4bzero diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S new file mode 100644 index 00000000000..a114cbcf2a4 --- /dev/null +++ b/arch/sparc/lib/NG4patch.S @@ -0,0 +1,54 @@ +/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant. + * + * Copyright (C) 2012 David S. 
Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara4_patch_copyops + .type niagara4_patch_copyops,#function +niagara4_patch_copyops: + NG_DO_PATCH(memcpy, NG4memcpy) + NG_DO_PATCH(___copy_from_user, NG4copy_from_user) + NG_DO_PATCH(___copy_to_user, NG4copy_to_user) + retl + nop + .size niagara4_patch_copyops,.-niagara4_patch_copyops + + .globl niagara4_patch_bzero + .type niagara4_patch_bzero,#function +niagara4_patch_bzero: + NG_DO_PATCH(memset, NG4memset) + NG_DO_PATCH(__bzero, NG4bzero) + NG_DO_PATCH(__clear_user, NGclear_user) + NG_DO_PATCH(tsb_init, NGtsb_init) + retl + nop + .size niagara4_patch_bzero,.-niagara4_patch_bzero + + .globl niagara4_patch_pageops + .type niagara4_patch_pageops,#function +niagara4_patch_pageops: + NG_DO_PATCH(copy_user_page, NG4copy_user_page) + NG_DO_PATCH(_clear_page, NG4clear_page) + NG_DO_PATCH(clear_user_page, NG4clear_user_page) + retl + nop + .size niagara4_patch_pageops,.-niagara4_patch_pageops diff --git a/arch/sparc/lib/NGbzero.S b/arch/sparc/lib/NGbzero.S new file mode 100644 index 00000000000..beab29bf419 --- /dev/null +++ b/arch/sparc/lib/NGbzero.S @@ -0,0 +1,160 @@ +/* NGbzero.S: Niagara optimized memset/clear_user. + * + * Copyright (C) 2006 David S. Miller (davem@davemloft.net) + */ +#include <asm/asi.h> + +#define EX_ST(x,y) \ +98: x,y; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_o1; \ + .text; \ + .align 4; + + .text + + .globl NGmemset + .type NGmemset, #function +NGmemset: /* %o0=buf, %o1=pat, %o2=len */ + and %o1, 0xff, %o3 + mov %o2, %o1 + sllx %o3, 8, %g1 + or %g1, %o3, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %xcc, 1f + or %g1, %o2, %o2 + + .globl NGbzero + .type NGbzero, #function +NGbzero: + clr %o2 +1: brz,pn %o1, NGbzero_return + mov %o0, %o3 + + /* %o5: saved %asi, restored at NGbzero_done + * %g7: store-init %asi to use + * %o4: non-store-init %asi to use + */ + rd %asi, %o5 + mov ASI_BLK_INIT_QUAD_LDD_P, %g7 + mov ASI_P, %o4 + wr %o4, 0x0, %asi + +NGbzero_from_clear_user: + cmp %o1, 15 + bl,pn %icc, NGbzero_tiny + andcc %o0, 0x7, %g1 + be,pt %xcc, 2f + mov 8, %g2 + sub %g2, %g1, %g1 + sub %o1, %g1, %o1 +1: EX_ST(stba %o2, [%o0 + 0x00] %asi) + subcc %g1, 1, %g1 + bne,pt %xcc, 1b + add %o0, 1, %o0 +2: cmp %o1, 128 + bl,pn %icc, NGbzero_medium + andcc %o0, (64 - 1), %g1 + be,pt %xcc, NGbzero_pre_loop + mov 64, %g2 + sub %g2, %g1, %g1 + sub %o1, %g1, %o1 +1: EX_ST(stxa %o2, [%o0 + 0x00] %asi) + subcc %g1, 8, %g1 + bne,pt %xcc, 1b + add %o0, 8, %o0 + +NGbzero_pre_loop: + wr %g7, 0x0, %asi + andn %o1, (64 - 1), %g1 + sub %o1, %g1, %o1 +NGbzero_loop: + EX_ST(stxa %o2, [%o0 + 0x00] %asi) + EX_ST(stxa %o2, [%o0 + 0x08] %asi) + EX_ST(stxa %o2, [%o0 + 0x10] %asi) + EX_ST(stxa %o2, [%o0 + 0x18] %asi) + EX_ST(stxa %o2, [%o0 + 0x20] %asi) + EX_ST(stxa %o2, [%o0 + 0x28] %asi) + EX_ST(stxa %o2, [%o0 + 0x30] %asi) + EX_ST(stxa %o2, [%o0 + 0x38] %asi) + subcc %g1, 64, %g1 + bne,pt %xcc, NGbzero_loop + add %o0, 64, %o0 + + membar #Sync + wr %o4, 0x0, %asi + brz,pn %o1, NGbzero_done +NGbzero_medium: + andncc %o1, 0x7, %g1 + be,pn 
%xcc, 2f + sub %o1, %g1, %o1 +1: EX_ST(stxa %o2, [%o0 + 0x00] %asi) + subcc %g1, 8, %g1 + bne,pt %xcc, 1b + add %o0, 8, %o0 +2: brz,pt %o1, NGbzero_done + nop + +NGbzero_tiny: +1: EX_ST(stba %o2, [%o0 + 0x00] %asi) + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 1, %o0 + + /* fallthrough */ + +NGbzero_done: + wr %o5, 0x0, %asi + +NGbzero_return: + retl + mov %o3, %o0 + .size NGbzero, .-NGbzero + .size NGmemset, .-NGmemset + + .globl NGclear_user + .type NGclear_user, #function +NGclear_user: /* %o0=buf, %o1=len */ + rd %asi, %o5 + brz,pn %o1, NGbzero_done + clr %o3 + cmp %o5, ASI_AIUS + bne,pn %icc, NGbzero + clr %o2 + mov ASI_BLK_INIT_QUAD_LDD_AIUS, %g7 + ba,pt %xcc, NGbzero_from_clear_user + mov ASI_AIUS, %o4 + .size NGclear_user, .-NGclear_user + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara_patch_bzero + .type niagara_patch_bzero,#function +niagara_patch_bzero: + NG_DO_PATCH(memset, NGmemset) + NG_DO_PATCH(__bzero, NGbzero) + NG_DO_PATCH(__clear_user, NGclear_user) + NG_DO_PATCH(tsb_init, NGtsb_init) + retl + nop + .size niagara_patch_bzero,.-niagara_patch_bzero diff --git a/arch/sparc/lib/NGcopy_from_user.S b/arch/sparc/lib/NGcopy_from_user.S new file mode 100644 index 00000000000..5d1e4d1ac21 --- /dev/null +++ b/arch/sparc/lib/NGcopy_from_user.S @@ -0,0 +1,32 @@ +/* NGcopy_from_user.S: Niagara optimized copy from userspace. + * + * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __ret_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME NGcopy_from_user +#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest +#define LOAD_TWIN(addr_reg,dest0,dest1) \ + ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0 +#define EX_RETVAL(x) %g0 + +#ifdef __KERNEL__ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NGmemcpy.S" diff --git a/arch/sparc/lib/NGcopy_to_user.S b/arch/sparc/lib/NGcopy_to_user.S new file mode 100644 index 00000000000..ff630dcb273 --- /dev/null +++ b/arch/sparc/lib/NGcopy_to_user.S @@ -0,0 +1,35 @@ +/* NGcopy_to_user.S: Niagara optimized copy to userspace. + * + * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __ret_one_asi;\ + .text; \ + .align 4; + +#ifndef ASI_AIUS +#define ASI_AIUS 0x11 +#endif + +#define FUNC_NAME NGcopy_to_user +#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS +#define EX_RETVAL(x) %g0 + +#ifdef __KERNEL__ + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. 
+ */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop +#endif + +#include "NGmemcpy.S" diff --git a/arch/sparc/lib/NGmemcpy.S b/arch/sparc/lib/NGmemcpy.S new file mode 100644 index 00000000000..96a14caf696 --- /dev/null +++ b/arch/sparc/lib/NGmemcpy.S @@ -0,0 +1,425 @@ +/* NGmemcpy.S: Niagara optimized memcpy. + * + * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) + */ + +#ifdef __KERNEL__ +#include <asm/asi.h> +#include <asm/thread_info.h> +#define GLOBAL_SPARE %g7 +#define RESTORE_ASI(TMP) \ + ldub [%g6 + TI_CURRENT_DS], TMP; \ + wr TMP, 0x0, %asi; +#else +#define GLOBAL_SPARE %g5 +#define RESTORE_ASI(TMP) \ + wr %g0, ASI_PNF, %asi +#endif + +#ifdef __sparc_v9__ +#define SAVE_AMOUNT 128 +#else +#define SAVE_AMOUNT 64 +#endif + +#ifndef STORE_ASI +#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#ifndef MEMCPY_DEBUG +#define LOAD(type,addr,dest) type [addr], dest +#else +#define LOAD(type,addr,dest) type##a [addr] 0x80, dest +#endif +#endif + +#ifndef LOAD_TWIN +#define LOAD_TWIN(addr_reg,dest0,dest1) \ + ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0 +#endif + +#ifndef STORE +#define STORE(type,src,addr) type src, [addr] +#endif + +#ifndef STORE_INIT +#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA +#define STORE_INIT(src,addr) stxa src, [addr] %asi +#else +#define STORE_INIT(src,addr) stx src, [addr + 0x00] +#endif +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME NGmemcpy +#endif + +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %i0=dst, %i1=src, %i2=len */ + PREAMBLE + save %sp, -SAVE_AMOUNT, %sp + srlx %i2, 31, %g2 + cmp %g2, 0 + tne %xcc, 5 + mov %i0, %o0 + cmp %i2, 0 + be,pn %XCC, 85f + or %o0, %i1, %i3 + cmp %i2, 16 + blu,a,pn %XCC, 80f + or %i3, %i2, %i3 + + /* 2 blocks (128 bytes) is the minimum we can do the block + * copy with. We need to ensure that we'll iterate at least + * once in the block copy loop. At worst we'll need to align + * the destination to a 64-byte boundary which can chew up + * to (64 - 1) bytes from the length before we perform the + * block copy loop. + */ + cmp %i2, (2 * 64) + blu,pt %XCC, 70f + andcc %i3, 0x7, %g0 + + /* %o0: dst + * %i1: src + * %i2: len (known to be >= 128) + * + * The block copy loops will use %i4/%i5,%g2/%g3 as + * temporaries while copying the data. + */ + + LOAD(prefetch, %i1, #one_read) + wr %g0, STORE_ASI, %asi + + /* Align destination on 64-byte boundary. */ + andcc %o0, (64 - 1), %i4 + be,pt %XCC, 2f + sub %i4, 64, %i4 + sub %g0, %i4, %i4 ! bytes to align dst + sub %i2, %i4, %i2 +1: subcc %i4, 1, %i4 + EX_LD(LOAD(ldub, %i1, %g1)) + EX_ST(STORE(stb, %g1, %o0)) + add %i1, 1, %i1 + bne,pt %XCC, 1b + add %o0, 1, %o0 + + /* If the source is on a 16-byte boundary we can do + * the direct block copy loop. If it is 8-byte aligned + * we can do the 16-byte loads offset by -8 bytes and the + * init stores offset by one register. + * + * If the source is not even 8-byte aligned, we need to do + * shifting and masking (basically integer faligndata). + * + * The careful bit with init stores is that if we store + * to any part of the cache line we have to store the whole + * cacheline else we can end up with corrupt L2 cache line + * contents. 
Since the loop works on 64-bytes of 64-byte + * aligned store data at a time, this is easy to ensure. + */ +2: + andcc %i1, (16 - 1), %i4 + andn %i2, (64 - 1), %g1 ! block copy loop iterator + be,pt %XCC, 50f + sub %i2, %g1, %i2 ! final sub-block copy bytes + + cmp %i4, 8 + be,pt %XCC, 10f + sub %i1, %i4, %i1 + + /* Neither 8-byte nor 16-byte aligned, shift and mask. */ + and %i4, 0x7, GLOBAL_SPARE + sll GLOBAL_SPARE, 3, GLOBAL_SPARE + mov 64, %i5 + EX_LD(LOAD_TWIN(%i1, %g2, %g3)) + sub %i5, GLOBAL_SPARE, %i5 + mov 16, %o4 + mov 32, %o5 + mov 48, %o7 + mov 64, %i3 + + bg,pn %XCC, 9f + nop + +#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \ + sllx WORD1, POST_SHIFT, WORD1; \ + srlx WORD2, PRE_SHIFT, TMP; \ + sllx WORD2, POST_SHIFT, WORD2; \ + or WORD1, TMP, WORD1; \ + srlx WORD3, PRE_SHIFT, TMP; \ + or WORD2, TMP, WORD2; + +8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3)) + MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) + LOAD(prefetch, %i1 + %i3, #one_read) + + EX_ST(STORE_INIT(%g2, %o0 + 0x00)) + EX_ST(STORE_INIT(%g3, %o0 + 0x08)) + + EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3)) + MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) + + EX_ST(STORE_INIT(%o2, %o0 + 0x10)) + EX_ST(STORE_INIT(%o3, %o0 + 0x18)) + + EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) + MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) + + EX_ST(STORE_INIT(%g2, %o0 + 0x20)) + EX_ST(STORE_INIT(%g3, %o0 + 0x28)) + + EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3)) + add %i1, 64, %i1 + MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) + + EX_ST(STORE_INIT(%o2, %o0 + 0x30)) + EX_ST(STORE_INIT(%o3, %o0 + 0x38)) + + subcc %g1, 64, %g1 + bne,pt %XCC, 8b + add %o0, 64, %o0 + + ba,pt %XCC, 60f + add %i1, %i4, %i1 + +9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3)) + MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) + LOAD(prefetch, %i1 + %i3, #one_read) + + EX_ST(STORE_INIT(%g3, %o0 + 0x00)) + EX_ST(STORE_INIT(%o2, %o0 + 0x08)) + + EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3)) + MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) + + EX_ST(STORE_INIT(%o3, %o0 + 0x10)) + EX_ST(STORE_INIT(%g2, %o0 + 0x18)) + + EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) + MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) + + EX_ST(STORE_INIT(%g3, %o0 + 0x20)) + EX_ST(STORE_INIT(%o2, %o0 + 0x28)) + + EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3)) + add %i1, 64, %i1 + MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) + + EX_ST(STORE_INIT(%o3, %o0 + 0x30)) + EX_ST(STORE_INIT(%g2, %o0 + 0x38)) + + subcc %g1, 64, %g1 + bne,pt %XCC, 9b + add %o0, 64, %o0 + + ba,pt %XCC, 60f + add %i1, %i4, %i1 + +10: /* Destination is 64-byte aligned, source was only 8-byte + * aligned but it has been subtracted by 8 and we perform + * one twin load ahead, then add 8 back into source when + * we finish the loop. + */ + EX_LD(LOAD_TWIN(%i1, %o4, %o5)) + mov 16, %o7 + mov 32, %g2 + mov 48, %g3 + mov 64, %o1 +1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) + LOAD(prefetch, %i1 + %o1, #one_read) + EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! 
initializes cache line + EX_ST(STORE_INIT(%o2, %o0 + 0x08)) + EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5)) + EX_ST(STORE_INIT(%o3, %o0 + 0x10)) + EX_ST(STORE_INIT(%o4, %o0 + 0x18)) + EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3)) + EX_ST(STORE_INIT(%o5, %o0 + 0x20)) + EX_ST(STORE_INIT(%o2, %o0 + 0x28)) + EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5)) + add %i1, 64, %i1 + EX_ST(STORE_INIT(%o3, %o0 + 0x30)) + EX_ST(STORE_INIT(%o4, %o0 + 0x38)) + subcc %g1, 64, %g1 + bne,pt %XCC, 1b + add %o0, 64, %o0 + + ba,pt %XCC, 60f + add %i1, 0x8, %i1 + +50: /* Destination is 64-byte aligned, and source is 16-byte + * aligned. + */ + mov 16, %o7 + mov 32, %g2 + mov 48, %g3 + mov 64, %o1 +1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5)) + EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) + LOAD(prefetch, %i1 + %o1, #one_read) + EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line + EX_ST(STORE_INIT(%o5, %o0 + 0x08)) + EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5)) + EX_ST(STORE_INIT(%o2, %o0 + 0x10)) + EX_ST(STORE_INIT(%o3, %o0 + 0x18)) + EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3)) + add %i1, 64, %i1 + EX_ST(STORE_INIT(%o4, %o0 + 0x20)) + EX_ST(STORE_INIT(%o5, %o0 + 0x28)) + EX_ST(STORE_INIT(%o2, %o0 + 0x30)) + EX_ST(STORE_INIT(%o3, %o0 + 0x38)) + subcc %g1, 64, %g1 + bne,pt %XCC, 1b + add %o0, 64, %o0 + /* fall through */ + +60: + membar #Sync + + /* %i2 contains any final bytes still needed to be copied + * over. If anything is left, we copy it one byte at a time. + */ + RESTORE_ASI(%i3) + brz,pt %i2, 85f + sub %o0, %i1, %i3 + ba,a,pt %XCC, 90f + + .align 64 +70: /* 16 < len <= 64 */ + bne,pn %XCC, 75f + sub %o0, %i1, %i3 + +72: + andn %i2, 0xf, %i4 + and %i2, 0xf, %i2 +1: subcc %i4, 0x10, %i4 + EX_LD(LOAD(ldx, %i1, %o4)) + add %i1, 0x08, %i1 + EX_LD(LOAD(ldx, %i1, %g1)) + sub %i1, 0x08, %i1 + EX_ST(STORE(stx, %o4, %i1 + %i3)) + add %i1, 0x8, %i1 + EX_ST(STORE(stx, %g1, %i1 + %i3)) + bgu,pt %XCC, 1b + add %i1, 0x8, %i1 +73: andcc %i2, 0x8, %g0 + be,pt %XCC, 1f + nop + sub %i2, 0x8, %i2 + EX_LD(LOAD(ldx, %i1, %o4)) + EX_ST(STORE(stx, %o4, %i1 + %i3)) + add %i1, 0x8, %i1 +1: andcc %i2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %i2, 0x4, %i2 + EX_LD(LOAD(lduw, %i1, %i5)) + EX_ST(STORE(stw, %i5, %i1 + %i3)) + add %i1, 0x4, %i1 +1: cmp %i2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +75: + andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %i2, %g1, %i2 + +1: subcc %g1, 1, %g1 + EX_LD(LOAD(ldub, %i1, %i5)) + EX_ST(STORE(stb, %i5, %i1 + %i3)) + bgu,pt %icc, 1b + add %i1, 1, %i1 + +2: add %i1, %i3, %o0 + andcc %i1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %i2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %xcc, 73b + +8: mov 64, %i3 + andn %i1, 0x7, %i1 + EX_LD(LOAD(ldx, %i1, %g2)) + sub %i3, %g1, %i3 + andn %i2, 0x7, %i4 + sllx %g2, %g1, %g2 +1: add %i1, 0x8, %i1 + EX_LD(LOAD(ldx, %i1, %g3)) + subcc %i4, 0x8, %i4 + srlx %g3, %i3, %i5 + or %i5, %g2, %i5 + EX_ST(STORE(stx, %i5, %o0)) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %i2, 0x7, %i2 + be,pn %icc, 85f + add %i1, %g1, %i1 + ba,pt %xcc, 90f + sub %o0, %i1, %i3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %i3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %i1, %i3 + +1: + subcc %i2, 4, %i2 + EX_LD(LOAD(lduw, %i1, %g1)) + EX_ST(STORE(stw, %g1, %i1 + %i3)) + bgu,pt %XCC, 1b + add %i1, 4, %i1 + +85: ret + restore EX_RETVAL(%i0), %g0, %o0 + + .align 32 +90: + subcc %i2, 1, %i2 + EX_LD(LOAD(ldub, %i1, %g1)) + EX_ST(STORE(stb, %g1, %i1 + %i3)) + bgu,pt %XCC, 90b + add %i1, 1, %i1 + ret + restore EX_RETVAL(%i0), %g0, %o0 + + .size 
FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S new file mode 100644 index 00000000000..423d46e2258 --- /dev/null +++ b/arch/sparc/lib/NGpage.S @@ -0,0 +1,137 @@ +/* NGpage.S: Niagara optimize clear and copy page. + * + * Copyright (C) 2006 (davem@davemloft.net) + */ + +#include <asm/asi.h> +#include <asm/page.h> + + .text + .align 32 + + /* This is heavily simplified from the sun4u variants + * because Niagara does not have any D-cache aliasing issues + * and also we don't need to use the FPU in order to implement + * an optimal page copy/clear. + */ + +NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ + save %sp, -192, %sp + rd %asi, %g3 + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + set PAGE_SIZE, %g7 + prefetch [%i1 + 0x00], #one_read + prefetch [%i1 + 0x40], #one_read + +1: prefetch [%i1 + 0x80], #one_read + prefetch [%i1 + 0xc0], #one_read + ldda [%i1 + 0x00] %asi, %o2 + ldda [%i1 + 0x10] %asi, %o4 + ldda [%i1 + 0x20] %asi, %l2 + ldda [%i1 + 0x30] %asi, %l4 + stxa %o2, [%i0 + 0x00] %asi + stxa %o3, [%i0 + 0x08] %asi + stxa %o4, [%i0 + 0x10] %asi + stxa %o5, [%i0 + 0x18] %asi + stxa %l2, [%i0 + 0x20] %asi + stxa %l3, [%i0 + 0x28] %asi + stxa %l4, [%i0 + 0x30] %asi + stxa %l5, [%i0 + 0x38] %asi + ldda [%i1 + 0x40] %asi, %o2 + ldda [%i1 + 0x50] %asi, %o4 + ldda [%i1 + 0x60] %asi, %l2 + ldda [%i1 + 0x70] %asi, %l4 + stxa %o2, [%i0 + 0x40] %asi + stxa %o3, [%i0 + 0x48] %asi + stxa %o4, [%i0 + 0x50] %asi + stxa %o5, [%i0 + 0x58] %asi + stxa %l2, [%i0 + 0x60] %asi + stxa %l3, [%i0 + 0x68] %asi + stxa %l4, [%i0 + 0x70] %asi + stxa %l5, [%i0 + 0x78] %asi + add %i1, 128, %i1 + subcc %g7, 128, %g7 + bne,pt %xcc, 1b + add %i0, 128, %i0 + wr %g3, 0x0, %asi + membar #Sync + ret + restore + + .align 32 + .globl NGclear_page + .globl NGclear_user_page +NGclear_page: /* %o0=dest */ +NGclear_user_page: /* %o0=dest, %o1=vaddr */ + rd %asi, %g3 + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + set PAGE_SIZE, %g7 + +1: stxa %g0, [%o0 + 0x00] %asi + stxa %g0, [%o0 + 0x08] %asi + stxa %g0, [%o0 + 0x10] %asi + stxa %g0, [%o0 + 0x18] %asi + stxa %g0, [%o0 + 0x20] %asi + stxa %g0, [%o0 + 0x28] %asi + stxa %g0, [%o0 + 0x30] %asi + stxa %g0, [%o0 + 0x38] %asi + stxa %g0, [%o0 + 0x40] %asi + stxa %g0, [%o0 + 0x48] %asi + stxa %g0, [%o0 + 0x50] %asi + stxa %g0, [%o0 + 0x58] %asi + stxa %g0, [%o0 + 0x60] %asi + stxa %g0, [%o0 + 0x68] %asi + stxa %g0, [%o0 + 0x70] %asi + stxa %g0, [%o0 + 0x78] %asi + stxa %g0, [%o0 + 0x80] %asi + stxa %g0, [%o0 + 0x88] %asi + stxa %g0, [%o0 + 0x90] %asi + stxa %g0, [%o0 + 0x98] %asi + stxa %g0, [%o0 + 0xa0] %asi + stxa %g0, [%o0 + 0xa8] %asi + stxa %g0, [%o0 + 0xb0] %asi + stxa %g0, [%o0 + 0xb8] %asi + stxa %g0, [%o0 + 0xc0] %asi + stxa %g0, [%o0 + 0xc8] %asi + stxa %g0, [%o0 + 0xd0] %asi + stxa %g0, [%o0 + 0xd8] %asi + stxa %g0, [%o0 + 0xe0] %asi + stxa %g0, [%o0 + 0xe8] %asi + stxa %g0, [%o0 + 0xf0] %asi + stxa %g0, [%o0 + 0xf8] %asi + subcc %g7, 256, %g7 + bne,pt %xcc, 1b + add %o0, 256, %o0 + wr %g3, 0x0, %asi + membar #Sync + retl + nop + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara_patch_pageops + .type 
niagara_patch_pageops,#function +niagara_patch_pageops: + NG_DO_PATCH(copy_user_page, NGcopy_user_page) + NG_DO_PATCH(_clear_page, NGclear_page) + NG_DO_PATCH(clear_user_page, NGclear_user_page) + retl + nop + .size niagara_patch_pageops,.-niagara_patch_pageops diff --git a/arch/sparc/lib/NGpatch.S b/arch/sparc/lib/NGpatch.S new file mode 100644 index 00000000000..3b0674fc336 --- /dev/null +++ b/arch/sparc/lib/NGpatch.S @@ -0,0 +1,33 @@ +/* NGpatch.S: Patch Ultra-I routines with Niagara variant. + * + * Copyright (C) 2006 David S. Miller <davem@davemloft.net> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define NG_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl niagara_patch_copyops + .type niagara_patch_copyops,#function +niagara_patch_copyops: + NG_DO_PATCH(memcpy, NGmemcpy) + NG_DO_PATCH(___copy_from_user, NGcopy_from_user) + NG_DO_PATCH(___copy_to_user, NGcopy_to_user) + retl + nop + .size niagara_patch_copyops,.-niagara_patch_copyops diff --git a/arch/sparc/lib/PeeCeeI.c b/arch/sparc/lib/PeeCeeI.c new file mode 100644 index 00000000000..6529f865759 --- /dev/null +++ b/arch/sparc/lib/PeeCeeI.c @@ -0,0 +1,211 @@ +/* + * PeeCeeI.c: The emerging standard... + * + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + */ + +#include <linux/module.h> + +#include <asm/io.h> +#include <asm/byteorder.h> + +void outsb(unsigned long __addr, const void *src, unsigned long count) +{ + void __iomem *addr = (void __iomem *) __addr; + const u8 *p = src; + + while (count--) + outb(*p++, addr); +} +EXPORT_SYMBOL(outsb); + +void outsw(unsigned long __addr, const void *src, unsigned long count) +{ + void __iomem *addr = (void __iomem *) __addr; + + while (count--) { + __raw_writew(*(u16 *)src, addr); + src += sizeof(u16); + } +} +EXPORT_SYMBOL(outsw); + +void outsl(unsigned long __addr, const void *src, unsigned long count) +{ + void __iomem *addr = (void __iomem *) __addr; + u32 l, l2; + + if (!count) + return; + + switch (((unsigned long)src) & 0x3) { + case 0x0: + /* src is naturally aligned */ + while (count--) { + __raw_writel(*(u32 *)src, addr); + src += sizeof(u32); + } + break; + case 0x2: + /* 2-byte alignment */ + while (count--) { + l = (*(u16 *)src) << 16; + l |= *(u16 *)(src + sizeof(u16)); + __raw_writel(l, addr); + src += sizeof(u32); + } + break; + case 0x1: + /* Hold three bytes in l each time, grab a byte from l2 */ + l = (*(u8 *)src) << 24; + l |= (*(u16 *)(src + sizeof(u8))) << 8; + src += sizeof(u8) + sizeof(u16); + while (count--) { + l2 = *(u32 *)src; + l |= (l2 >> 24); + __raw_writel(l, addr); + l = l2 << 8; + src += sizeof(u32); + } + break; + case 0x3: + /* Hold a byte in l each time, grab 3 bytes from l2 */ + l = (*(u8 *)src) << 24; + src += sizeof(u8); + while (count--) { + l2 = *(u32 *)src; + l |= (l2 >> 8); + __raw_writel(l, addr); + l = l2 << 24; + src += sizeof(u32); + } + break; + } +} +EXPORT_SYMBOL(outsl); + +void insb(unsigned long __addr, void *dst, unsigned long count) +{ + void __iomem *addr = (void __iomem *) __addr; + + if (count) { + u32 *pi; + u8 *pb = dst; + + while ((((unsigned long)pb) & 0x3) && count--) + *pb++ = inb(addr); + pi = (u32 *)pb; + while (count >= 4) { + u32 
w; + + w = (inb(addr) << 24); + w |= (inb(addr) << 16); + w |= (inb(addr) << 8); + w |= (inb(addr) << 0); + *pi++ = w; + count -= 4; + } + pb = (u8 *)pi; + while (count--) + *pb++ = inb(addr); + } +} +EXPORT_SYMBOL(insb); + +void insw(unsigned long __addr, void *dst, unsigned long count) +{ + void __iomem *addr = (void __iomem *) __addr; + + if (count) { + u16 *ps = dst; + u32 *pi; + + if (((unsigned long)ps) & 0x2) { + *ps++ = le16_to_cpu(inw(addr)); + count--; + } + pi = (u32 *)ps; + while (count >= 2) { + u32 w; + + w = (le16_to_cpu(inw(addr)) << 16); + w |= (le16_to_cpu(inw(addr)) << 0); + *pi++ = w; + count -= 2; + } + ps = (u16 *)pi; + if (count) + *ps = le16_to_cpu(inw(addr)); + } +} +EXPORT_SYMBOL(insw); + +void insl(unsigned long __addr, void *dst, unsigned long count) +{ + void __iomem *addr = (void __iomem *) __addr; + + if (count) { + if ((((unsigned long)dst) & 0x3) == 0) { + u32 *pi = dst; + while (count--) + *pi++ = le32_to_cpu(inl(addr)); + } else { + u32 l = 0, l2, *pi; + u16 *ps; + u8 *pb; + + switch (((unsigned long)dst) & 3) { + case 0x2: + ps = dst; + count -= 1; + l = le32_to_cpu(inl(addr)); + *ps++ = l; + pi = (u32 *)ps; + while (count--) { + l2 = le32_to_cpu(inl(addr)); + *pi++ = (l << 16) | (l2 >> 16); + l = l2; + } + ps = (u16 *)pi; + *ps = l; + break; + + case 0x1: + pb = dst; + count -= 1; + l = le32_to_cpu(inl(addr)); + *pb++ = l >> 24; + ps = (u16 *)pb; + *ps++ = ((l >> 8) & 0xffff); + pi = (u32 *)ps; + while (count--) { + l2 = le32_to_cpu(inl(addr)); + *pi++ = (l << 24) | (l2 >> 8); + l = l2; + } + pb = (u8 *)pi; + *pb = l; + break; + + case 0x3: + pb = (u8 *)dst; + count -= 1; + l = le32_to_cpu(inl(addr)); + *pb++ = l >> 24; + pi = (u32 *)pb; + while (count--) { + l2 = le32_to_cpu(inl(addr)); + *pi++ = (l << 8) | (l2 >> 24); + l = l2; + } + ps = (u16 *)pi; + *ps++ = ((l >> 8) & 0xffff); + pb = (u8 *)ps; + *pb = l; + break; + } + } + } +} +EXPORT_SYMBOL(insl); + diff --git a/arch/sparc/lib/U1copy_from_user.S b/arch/sparc/lib/U1copy_from_user.S new file mode 100644 index 00000000000..a6ae2ea04bf --- /dev/null +++ b/arch/sparc/lib/U1copy_from_user.S @@ -0,0 +1,29 @@ +/* U1copy_from_user.S: UltraSparc-I/II/IIi/IIe optimized copy from userspace. + * + * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + +#define FUNC_NAME ___copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest +#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_AIUS, dest +#define EX_RETVAL(x) 0 + + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop; \ + +#include "U1memcpy.S" diff --git a/arch/sparc/lib/U1copy_to_user.S b/arch/sparc/lib/U1copy_to_user.S new file mode 100644 index 00000000000..f4b970eeb48 --- /dev/null +++ b/arch/sparc/lib/U1copy_to_user.S @@ -0,0 +1,29 @@ +/* U1copy_to_user.S: UltraSparc-I/II/IIi/IIe optimized copy to userspace. + * + * Copyright (C) 1999, 2000, 2004 David S. 
Miller (davem@redhat.com) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + +#define FUNC_NAME ___copy_to_user +#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS +#define EX_RETVAL(x) 0 + + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop; \ + +#include "U1memcpy.S" diff --git a/arch/sparc/lib/U1memcpy.S b/arch/sparc/lib/U1memcpy.S new file mode 100644 index 00000000000..b67142b7768 --- /dev/null +++ b/arch/sparc/lib/U1memcpy.S @@ -0,0 +1,563 @@ +/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy. + * + * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com) + * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE g7 +#else +#define GLOBAL_SPARE g5 +#define ASI_BLK_P 0xf0 +#define FPRS_FEF 0x04 +#ifdef MEMCPY_DEBUG +#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ + clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; +#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef LOAD_BLK +#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest +#endif + +#ifndef STORE +#define STORE(type,src,addr) type src, [addr] +#endif + +#ifndef STORE_BLK +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME memcpy +#endif + +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + +#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \ + faligndata %f1, %f2, %f48; \ + faligndata %f2, %f3, %f50; \ + faligndata %f3, %f4, %f52; \ + faligndata %f4, %f5, %f54; \ + faligndata %f5, %f6, %f56; \ + faligndata %f6, %f7, %f58; \ + faligndata %f7, %f8, %f60; \ + faligndata %f8, %f9, %f62; + +#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \ + EX_LD(LOAD_BLK(%src, %fdest)); \ + EX_ST(STORE_BLK(%fsrc, %dest)); \ + add %src, 0x40, %src; \ + subcc %len, 0x40, %len; \ + be,pn %xcc, jmptgt; \ + add %dest, 0x40, %dest; \ + +#define LOOP_CHUNK1(src, dest, len, branch_dest) \ + MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest) +#define LOOP_CHUNK2(src, dest, len, branch_dest) \ + MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest) +#define LOOP_CHUNK3(src, dest, len, branch_dest) \ + MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) + +#define DO_SYNC membar #Sync; +#define STORE_SYNC(dest, fsrc) \ + EX_ST(STORE_BLK(%fsrc, %dest)); \ + add %dest, 0x40, %dest; \ + DO_SYNC + +#define STORE_JUMP(dest, fsrc, target) \ + EX_ST(STORE_BLK(%fsrc, %dest)); \ + add %dest, 0x40, %dest; \ + ba,pt %xcc, target; \ + nop; + +#define FINISH_VISCHUNK(dest, f0, f1, left) \ + subcc %left, 8, %left;\ + bl,pn %xcc, 95f; \ + faligndata %f0, %f1, %f48; \ + EX_ST(STORE(std, %f48, %dest)); \ + add %dest, 8, %dest; + +#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ + subcc %left, 8, %left; \ + bl,pn %xcc, 95f; \ + fsrc2 %f0, %f1; + 
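The EX_LD/EX_ST wrappers above drop an entry into __ex_table for every load or store that may fault on a user address; the trap handler then looks the faulting PC up in that table and resumes at the fixup (here __retl_one, which reports the remaining length). A hedged C sketch of that lookup follows; the struct layout, field names and find_fixup are illustrative, not the in-tree definitions (kernels of this era stored absolute addresses in the table).

#include <stdint.h>
#include <stddef.h>

struct ex_entry {
    uintptr_t insn;   /* address of the whitelisted load/store (label 98:)  */
    uintptr_t fixup;  /* where to resume, e.g. a __retl_one-style stub      */
};

/* Binary search of a table sorted by instruction address. */
static uintptr_t find_fixup(const struct ex_entry *tab, size_t n, uintptr_t pc)
{
    size_t lo = 0, hi = n;

    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (tab[mid].insn == pc)
            return tab[mid].fixup;
        if (tab[mid].insn < pc)
            lo = mid + 1;
        else
            hi = mid;
    }
    return 0;   /* not a whitelisted access: treat as a real fault */
}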
+#define UNEVEN_VISCHUNK(dest, f0, f1, left) \ + UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ + ba,a,pt %xcc, 93f; + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 64 + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %xcc, 5 + PREAMBLE + mov %o0, %o4 + cmp %o2, 0 + be,pn %XCC, 85f + or %o0, %o1, %o3 + cmp %o2, 16 + blu,a,pn %XCC, 80f + or %o3, %o2, %o3 + + cmp %o2, (5 * 64) + blu,pt %XCC, 70f + andcc %o3, 0x7, %g0 + + /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */ + VISEntry + + /* Is 'dst' already aligned on an 64-byte boundary? */ + andcc %o0, 0x3f, %g2 + be,pt %XCC, 2f + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %o0, %o1, %GLOBAL_SPARE + sub %g2, 0x40, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + andcc %g2, 0x7, %g1 + be,pt %icc, 2f + and %g2, 0x38, %g2 + +1: subcc %g1, 0x1, %g1 + EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) + EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) + bgu,pt %XCC, 1b + add %o1, 0x1, %o1 + + add %o1, %GLOBAL_SPARE, %o0 + +2: cmp %g2, 0x0 + and %o1, 0x7, %g1 + be,pt %icc, 3f + alignaddr %o1, %g0, %o1 + + EX_LD(LOAD(ldd, %o1, %f4)) +1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f4, %f6, %f0 + EX_ST(STORE(std, %f0, %o0)) + be,pn %icc, 3f + add %o0, 0x8, %o0 + + EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f6, %f4, %f0 + EX_ST(STORE(std, %f0, %o0)) + bne,pt %icc, 1b + add %o0, 0x8, %o0 + + /* Destination is 64-byte aligned. */ +3: + membar #LoadStore | #StoreStore | #StoreLoad + + subcc %o2, 0x40, %GLOBAL_SPARE + add %o1, %g1, %g1 + andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE + srl %g1, 3, %g2 + sub %o2, %GLOBAL_SPARE, %g3 + andn %o1, (0x40 - 1), %o1 + and %g2, 7, %g2 + andncc %g3, 0x7, %g3 + fsrc2 %f0, %f2 + sub %g3, 0x8, %g3 + sub %o2, %GLOBAL_SPARE, %o2 + + add %g1, %GLOBAL_SPARE, %g1 + subcc %o2, %g3, %o2 + + EX_LD(LOAD_BLK(%o1, %f0)) + add %o1, 0x40, %o1 + add %g1, %g3, %g1 + EX_LD(LOAD_BLK(%o1, %f16)) + add %o1, 0x40, %o1 + sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE + EX_LD(LOAD_BLK(%o1, %f32)) + add %o1, 0x40, %o1 + + /* There are 8 instances of the unrolled loop, + * one for each possible alignment of the + * source buffer. Each loop instance is 452 + * bytes. 
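The dispatch sequence just below multiplies the source-alignment index (0..7) by 452, the size of one loop instance, using only shifts and adds: ((a*8 - a)*16 + a)*4 = 452*a. A small stand-alone check of that identity; chunk_offset is an illustrative name.

#include <assert.h>
#include <stdint.h>

static uint64_t chunk_offset(uint64_t a)
{
    uint64_t t = (a << 3) - a;   /* 7a   (sll + sub)   */
    t = (t << 4) + a;            /* 113a (sllx + add)  */
    return t << 2;               /* 452a (final sllx)  */
}

int main(void)
{
    for (uint64_t a = 0; a < 8; a++)
        assert(chunk_offset(a) == a * 452);
    return 0;
}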
+ */ + sll %g2, 3, %o3 + sub %o3, %g2, %o3 + sllx %o3, 4, %o3 + add %o3, %g2, %o3 + sllx %o3, 2, %g2 +1: rd %pc, %o3 + add %o3, %lo(1f - 1b), %o3 + jmpl %o3 + %g2, %g0 + nop + + .align 64 +1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f0, %f2, %f48 +1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) + STORE_SYNC(o0, f48) + FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) + STORE_JUMP(o0, f48, 40f) +2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) + STORE_SYNC(o0, f48) + FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) + STORE_JUMP(o0, f48, 48f) +3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) + STORE_SYNC(o0, f48) + FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) + STORE_JUMP(o0, f48, 56f) + +1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f2, %f4, %f48 +1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) + STORE_SYNC(o0, f48) + FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) + STORE_JUMP(o0, f48, 41f) +2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) + STORE_SYNC(o0, f48) + FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) + STORE_JUMP(o0, f48, 49f) +3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) + STORE_SYNC(o0, f48) + FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) + STORE_JUMP(o0, f48, 57f) + +1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f4, %f6, %f48 +1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) + STORE_SYNC(o0, f48) + FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) + STORE_JUMP(o0, f48, 42f) +2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) + STORE_SYNC(o0, f48) + FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) + STORE_JUMP(o0, f48, 50f) +3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) + STORE_SYNC(o0, f48) + FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) + STORE_JUMP(o0, f48, 58f) + +1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f6, %f8, %f48 +1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) + STORE_SYNC(o0, f48) + FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) + STORE_JUMP(o0, f48, 43f) +2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) + STORE_SYNC(o0, f48) + FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) + STORE_JUMP(o0, f48, 51f) +3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) + STORE_SYNC(o0, f48) + FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) + STORE_JUMP(o0, f48, 59f) + +1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f8, %f10, %f48 
+1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) + STORE_SYNC(o0, f48) + FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) + STORE_JUMP(o0, f48, 44f) +2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) + STORE_SYNC(o0, f48) + FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) + STORE_JUMP(o0, f48, 52f) +3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) + STORE_SYNC(o0, f48) + FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) + STORE_JUMP(o0, f48, 60f) + +1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f10, %f12, %f48 +1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) + STORE_SYNC(o0, f48) + FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) + STORE_JUMP(o0, f48, 45f) +2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) + STORE_SYNC(o0, f48) + FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) + STORE_JUMP(o0, f48, 53f) +3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) + STORE_SYNC(o0, f48) + FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) + STORE_JUMP(o0, f48, 61f) + +1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f12, %f14, %f48 +1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) + STORE_SYNC(o0, f48) + FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) + STORE_JUMP(o0, f48, 46f) +2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) + STORE_SYNC(o0, f48) + FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) + STORE_JUMP(o0, f48, 54f) +3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) + STORE_SYNC(o0, f48) + FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) + STORE_JUMP(o0, f48, 62f) + +1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) + LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) + FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) + LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) + FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) + LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) + ba,pt %xcc, 1b+4 + faligndata %f14, %f16, %f48 +1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) + STORE_SYNC(o0, f48) + FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) + STORE_JUMP(o0, f48, 47f) +2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) + STORE_SYNC(o0, f48) + FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) + STORE_JUMP(o0, f48, 55f) +3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) + STORE_SYNC(o0, f48) + FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) + STORE_JUMP(o0, f48, 63f) + +40: FINISH_VISCHUNK(o0, f0, f2, g3) +41: FINISH_VISCHUNK(o0, f2, f4, g3) +42: FINISH_VISCHUNK(o0, f4, f6, g3) +43: FINISH_VISCHUNK(o0, f6, f8, g3) +44: FINISH_VISCHUNK(o0, f8, f10, g3) +45: FINISH_VISCHUNK(o0, f10, f12, g3) +46: FINISH_VISCHUNK(o0, f12, f14, g3) +47: UNEVEN_VISCHUNK(o0, f14, f0, g3) +48: FINISH_VISCHUNK(o0, f16, f18, g3) +49: FINISH_VISCHUNK(o0, f18, f20, g3) +50: FINISH_VISCHUNK(o0, f20, f22, g3) +51: FINISH_VISCHUNK(o0, f22, f24, g3) +52: FINISH_VISCHUNK(o0, f24, f26, g3) +53: FINISH_VISCHUNK(o0, f26, f28, g3) +54: FINISH_VISCHUNK(o0, f28, f30, g3) +55: UNEVEN_VISCHUNK(o0, f30, f0, g3) +56: FINISH_VISCHUNK(o0, f32, f34, g3) +57: FINISH_VISCHUNK(o0, f34, f36, g3) +58: FINISH_VISCHUNK(o0, f36, f38, g3) +59: FINISH_VISCHUNK(o0, f38, f40, g3) +60: FINISH_VISCHUNK(o0, f40, f42, 
g3) +61: FINISH_VISCHUNK(o0, f42, f44, g3) +62: FINISH_VISCHUNK(o0, f44, f46, g3) +63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3) + +93: EX_LD(LOAD(ldd, %o1, %f2)) + add %o1, 8, %o1 + subcc %g3, 8, %g3 + faligndata %f0, %f2, %f8 + EX_ST(STORE(std, %f8, %o0)) + bl,pn %xcc, 95f + add %o0, 8, %o0 + EX_LD(LOAD(ldd, %o1, %f0)) + add %o1, 8, %o1 + subcc %g3, 8, %g3 + faligndata %f2, %f0, %f8 + EX_ST(STORE(std, %f8, %o0)) + bge,pt %xcc, 93b + add %o0, 8, %o0 + +95: brz,pt %o2, 2f + mov %g1, %o1 + +1: EX_LD(LOAD(ldub, %o1, %o3)) + add %o1, 1, %o1 + subcc %o2, 1, %o2 + EX_ST(STORE(stb, %o3, %o0)) + bne,pt %xcc, 1b + add %o0, 1, %o0 + +2: membar #StoreLoad | #StoreStore + VISExit + retl + mov EX_RETVAL(%o4), %o0 + + .align 64 +70: /* 16 < len <= (5 * 64) */ + bne,pn %XCC, 75f + sub %o0, %o1, %o3 + +72: andn %o2, 0xf, %GLOBAL_SPARE + and %o2, 0xf, %o2 +1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) + subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 + EX_ST(STORE(stx, %g1, %o1 + %o3)) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + EX_LD(LOAD(ldx, %o1, %o5)) + sub %o2, 0x8, %o2 + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + EX_LD(LOAD(lduw, %o1, %o5)) + sub %o2, 0x4, %o2 + EX_ST(STORE(stw, %o5, %o1 + %o3)) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +75: andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: EX_LD(LOAD(ldub, %o1, %o5)) + subcc %g1, 1, %g1 + EX_ST(STORE(stb, %o5, %o1 + %o3)) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, %o3, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %xcc, 73b + +8: mov 64, %o3 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1, %g2)) + sub %o3, %g1, %o3 + andn %o2, 0x7, %GLOBAL_SPARE + sllx %g2, %g1, %g2 +1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) + subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE + add %o1, 0x8, %o1 + srlx %g3, %o3, %o5 + or %o5, %g2, %o5 + EX_ST(STORE(stx, %o5, %o0)) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %xcc, 90f + sub %o0, %o1, %o3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, %o3 + +1: EX_LD(LOAD(lduw, %o1, %g1)) + subcc %o2, 4, %o2 + EX_ST(STORE(stw, %g1, %o1 + %o3)) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov EX_RETVAL(%o4), %o0 + + .align 32 +90: EX_LD(LOAD(ldub, %o1, %g1)) + subcc %o2, 1, %o2 + EX_ST(STORE(stb, %g1, %o1 + %o3)) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov EX_RETVAL(%o4), %o0 + + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/U3copy_from_user.S b/arch/sparc/lib/U3copy_from_user.S new file mode 100644 index 00000000000..b1acd1331c3 --- /dev/null +++ b/arch/sparc/lib/U3copy_from_user.S @@ -0,0 +1,18 @@ +/* U3copy_from_user.S: UltraSparc-III optimized copy from userspace. + * + * Copyright (C) 1999, 2000, 2004 David S. 
Miller (davem@redhat.com) + */ + +#define EX_LD(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + +#define FUNC_NAME U3copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest +#define EX_RETVAL(x) 0 + +#include "U3memcpy.S" diff --git a/arch/sparc/lib/U3copy_to_user.S b/arch/sparc/lib/U3copy_to_user.S new file mode 100644 index 00000000000..ef1e493afdf --- /dev/null +++ b/arch/sparc/lib/U3copy_to_user.S @@ -0,0 +1,29 @@ +/* U3copy_to_user.S: UltraSparc-III optimized copy to userspace. + * + * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) + */ + +#define EX_ST(x) \ +98: x; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + +#define FUNC_NAME U3copy_to_user +#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS +#define EX_RETVAL(x) 0 + + /* Writing to %asi is _expensive_ so we hardcode it. + * Reading %asi to check for KERNEL_DS is comparatively + * cheap. + */ +#define PREAMBLE \ + rd %asi, %g1; \ + cmp %g1, ASI_AIUS; \ + bne,pn %icc, ___copy_in_user; \ + nop; \ + +#include "U3memcpy.S" diff --git a/arch/sparc/lib/U3memcpy.S b/arch/sparc/lib/U3memcpy.S new file mode 100644 index 00000000000..7cae9cc6a20 --- /dev/null +++ b/arch/sparc/lib/U3memcpy.S @@ -0,0 +1,422 @@ +/* U3memcpy.S: UltraSparc-III optimized memcpy. + * + * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) + */ + +#ifdef __KERNEL__ +#include <asm/visasm.h> +#include <asm/asi.h> +#define GLOBAL_SPARE %g7 +#else +#define ASI_BLK_P 0xf0 +#define FPRS_FEF 0x04 +#ifdef MEMCPY_DEBUG +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ + clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#else +#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs +#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs +#endif +#define GLOBAL_SPARE %g5 +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#define STORE(type,src,addr) type src, [addr] +#endif + +#ifndef STORE_BLK +#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME U3memcpy +#endif + +#ifndef PREAMBLE +#define PREAMBLE +#endif + +#ifndef XCC +#define XCC xcc +#endif + + .register %g2,#scratch + .register %g3,#scratch + + /* Special/non-trivial issues of this code: + * + * 1) %o5 is preserved from VISEntryHalf to VISExitHalf + * 2) Only low 32 FPU registers are used so that only the + * lower half of the FPU register set is dirtied by this + * code. This is especially important in the kernel. + * 3) This code never prefetches cachelines past the end + * of the source buffer. + */ + + .text + .align 64 + + /* The cheetah's flexible spine, oversized liver, enlarged heart, + * slender muscular body, and claws make it the swiftest hunter + * in Africa and the fastest animal on land. Can reach speeds + * of up to 2.4GB per second. 
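Point (3) above, never prefetching past the end of the source buffer, can be sketched in portable C with __builtin_prefetch (GCC/Clang) standing in for the #one_read prefetches. The function name and the 0x1c0 distance are illustrative, the latter matching the farthest prefetch offset used in the loop below; this is a sketch of the bounding idea, not the kernel routine.

#include <stddef.h>
#include <string.h>

static void copy_with_bounded_prefetch(void *dst, const void *src, size_t len)
{
    const unsigned char *s = src;
    unsigned char *d = dst;
    const size_t ahead = 0x1c0;        /* prefetch distance in bytes          */

    while (len >= 64) {
        if (ahead < len)               /* clamp: never touch past src + len   */
            __builtin_prefetch(s + ahead, 0, 0);  /* read, streaming hint     */
        memcpy(d, s, 64);              /* one cache-line-sized chunk          */
        s += 64;
        d += 64;
        len -= 64;
    }
    memcpy(d, s, len);                 /* tail */
}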
+ */ + + .globl FUNC_NAME + .type FUNC_NAME,#function +FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ + srlx %o2, 31, %g2 + cmp %g2, 0 + tne %xcc, 5 + PREAMBLE + mov %o0, %o4 + cmp %o2, 0 + be,pn %XCC, 85f + or %o0, %o1, %o3 + cmp %o2, 16 + blu,a,pn %XCC, 80f + or %o3, %o2, %o3 + + cmp %o2, (3 * 64) + blu,pt %XCC, 70f + andcc %o3, 0x7, %g0 + + /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve + * o5 from here until we hit VISExitHalf. + */ + VISEntryHalf + + /* Is 'dst' already aligned on an 64-byte boundary? */ + andcc %o0, 0x3f, %g2 + be,pt %XCC, 2f + + /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number + * of bytes to copy to make 'dst' 64-byte aligned. We pre- + * subtract this from 'len'. + */ + sub %o0, %o1, GLOBAL_SPARE + sub %g2, 0x40, %g2 + sub %g0, %g2, %g2 + sub %o2, %g2, %o2 + andcc %g2, 0x7, %g1 + be,pt %icc, 2f + and %g2, 0x38, %g2 + +1: subcc %g1, 0x1, %g1 + EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) + EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE)) + bgu,pt %XCC, 1b + add %o1, 0x1, %o1 + + add %o1, GLOBAL_SPARE, %o0 + +2: cmp %g2, 0x0 + and %o1, 0x7, %g1 + be,pt %icc, 3f + alignaddr %o1, %g0, %o1 + + EX_LD(LOAD(ldd, %o1, %f4)) +1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f4, %f6, %f0 + EX_ST(STORE(std, %f0, %o0)) + be,pn %icc, 3f + add %o0, 0x8, %o0 + + EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f6, %f4, %f2 + EX_ST(STORE(std, %f2, %o0)) + bne,pt %icc, 1b + add %o0, 0x8, %o0 + +3: LOAD(prefetch, %o1 + 0x000, #one_read) + LOAD(prefetch, %o1 + 0x040, #one_read) + andn %o2, (0x40 - 1), GLOBAL_SPARE + LOAD(prefetch, %o1 + 0x080, #one_read) + LOAD(prefetch, %o1 + 0x0c0, #one_read) + LOAD(prefetch, %o1 + 0x100, #one_read) + EX_LD(LOAD(ldd, %o1 + 0x000, %f0)) + LOAD(prefetch, %o1 + 0x140, #one_read) + EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) + LOAD(prefetch, %o1 + 0x180, #one_read) + EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) + LOAD(prefetch, %o1 + 0x1c0, #one_read) + faligndata %f0, %f2, %f16 + EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) + faligndata %f2, %f4, %f18 + EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) + faligndata %f4, %f6, %f20 + EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) + faligndata %f6, %f8, %f22 + + EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) + faligndata %f8, %f10, %f24 + EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) + faligndata %f10, %f12, %f26 + EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) + + subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE + add %o1, 0x40, %o1 + bgu,pt %XCC, 1f + srl GLOBAL_SPARE, 6, %o3 + ba,pt %xcc, 2f + nop + + .align 64 +1: + EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) + faligndata %f12, %f14, %f28 + EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) + faligndata %f14, %f0, %f30 + EX_ST(STORE_BLK(%f16, %o0)) + EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) + faligndata %f0, %f2, %f16 + add %o0, 0x40, %o0 + + EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) + faligndata %f2, %f4, %f18 + EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) + faligndata %f4, %f6, %f20 + EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) + subcc %o3, 0x01, %o3 + faligndata %f6, %f8, %f22 + EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) + + faligndata %f8, %f10, %f24 + EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) + LOAD(prefetch, %o1 + 0x1c0, #one_read) + faligndata %f10, %f12, %f26 + bg,pt %XCC, 1b + add %o1, 0x40, %o1 + + /* Finally we copy the last full 64-byte block. 
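After the block loop, at most 63 bytes remain; the masks used a little further down (0x3f, 0x38, 0x7) split that tail into whole 8-byte stores plus a final run of single bytes. The same decomposition in C, with illustrative names:

#include <stddef.h>

/* len & 0x3f is what the 64-byte loop left over; the 0x38 part is handled
 * as whole 8-byte doublewords and the 0x7 part one byte at a time.
 */
static void split_tail(size_t len, size_t *dwords, size_t *tail_bytes)
{
    size_t rem = len & 0x3f;
    *dwords     = (rem & 0x38) >> 3;
    *tail_bytes = rem & 0x7;
}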
*/ +2: + EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) + faligndata %f12, %f14, %f28 + EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) + faligndata %f14, %f0, %f30 + EX_ST(STORE_BLK(%f16, %o0)) + EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) + faligndata %f0, %f2, %f16 + EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) + faligndata %f2, %f4, %f18 + EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) + faligndata %f4, %f6, %f20 + EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) + faligndata %f6, %f8, %f22 + EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) + faligndata %f8, %f10, %f24 + cmp %g1, 0 + be,pt %XCC, 1f + add %o0, 0x40, %o0 + EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) +1: faligndata %f10, %f12, %f26 + faligndata %f12, %f14, %f28 + faligndata %f14, %f0, %f30 + EX_ST(STORE_BLK(%f16, %o0)) + add %o0, 0x40, %o0 + add %o1, 0x40, %o1 + membar #Sync + + /* Now we copy the (len modulo 64) bytes at the end. + * Note how we borrow the %f0 loaded above. + * + * Also notice how this code is careful not to perform a + * load past the end of the src buffer. + */ + and %o2, 0x3f, %o2 + andcc %o2, 0x38, %g2 + be,pn %XCC, 2f + subcc %g2, 0x8, %g2 + be,pn %XCC, 2f + cmp %g1, 0 + + sub %o2, %g2, %o2 + be,a,pt %XCC, 1f + EX_LD(LOAD(ldd, %o1 + 0x00, %f0)) + +1: EX_LD(LOAD(ldd, %o1 + 0x08, %f2)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f0, %f2, %f8 + EX_ST(STORE(std, %f8, %o0)) + be,pn %XCC, 2f + add %o0, 0x8, %o0 + EX_LD(LOAD(ldd, %o1 + 0x08, %f0)) + add %o1, 0x8, %o1 + subcc %g2, 0x8, %g2 + faligndata %f2, %f0, %f8 + EX_ST(STORE(std, %f8, %o0)) + bne,pn %XCC, 1b + add %o0, 0x8, %o0 + + /* If anything is left, we copy it one byte at a time. + * Note that %g1 is (src & 0x3) saved above before the + * alignaddr was performed. + */ +2: + cmp %o2, 0 + add %o1, %g1, %o1 + VISExitHalf + be,pn %XCC, 85f + sub %o0, %o1, %o3 + + andcc %g1, 0x7, %g0 + bne,pn %icc, 90f + andcc %o2, 0x8, %g0 + be,pt %icc, 1f + nop + EX_LD(LOAD(ldx, %o1, %o5)) + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 + +1: andcc %o2, 0x4, %g0 + be,pt %icc, 1f + nop + EX_LD(LOAD(lduw, %o1, %o5)) + EX_ST(STORE(stw, %o5, %o1 + %o3)) + add %o1, 0x4, %o1 + +1: andcc %o2, 0x2, %g0 + be,pt %icc, 1f + nop + EX_LD(LOAD(lduh, %o1, %o5)) + EX_ST(STORE(sth, %o5, %o1 + %o3)) + add %o1, 0x2, %o1 + +1: andcc %o2, 0x1, %g0 + be,pt %icc, 85f + nop + EX_LD(LOAD(ldub, %o1, %o5)) + ba,pt %xcc, 85f + EX_ST(STORE(stb, %o5, %o1 + %o3)) + + .align 64 +70: /* 16 < len <= 64 */ + bne,pn %XCC, 75f + sub %o0, %o1, %o3 + +72: + andn %o2, 0xf, GLOBAL_SPARE + and %o2, 0xf, %o2 +1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE + EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) + EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 + EX_ST(STORE(stx, %g1, %o1 + %o3)) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x8, %o2 + EX_LD(LOAD(ldx, %o1, %o5)) + EX_ST(STORE(stx, %o5, %o1 + %o3)) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + EX_LD(LOAD(lduw, %o1, %o5)) + EX_ST(STORE(stw, %o5, %o1 + %o3)) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +75: + andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: subcc %g1, 1, %g1 + EX_LD(LOAD(ldub, %o1, %o5)) + EX_ST(STORE(stb, %o5, %o1 + %o3)) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, %o3, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %xcc, 73b + +8: mov 64, %o3 + andn %o1, 0x7, %o1 + EX_LD(LOAD(ldx, %o1, %g2)) + sub %o3, %g1, %o3 + andn %o2, 
0x7, GLOBAL_SPARE + sllx %g2, %g1, %g2 +1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) + subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE + add %o1, 0x8, %o1 + srlx %g3, %o3, %o5 + or %o5, %g2, %o5 + EX_ST(STORE(stx, %o5, %o0)) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %xcc, 90f + sub %o0, %o1, %o3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, %o3 + +1: + subcc %o2, 4, %o2 + EX_LD(LOAD(lduw, %o1, %g1)) + EX_ST(STORE(stw, %g1, %o1 + %o3)) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov EX_RETVAL(%o4), %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + EX_LD(LOAD(ldub, %o1, %g1)) + EX_ST(STORE(stb, %g1, %o1 + %o3)) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov EX_RETVAL(%o4), %o0 + + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/U3patch.S b/arch/sparc/lib/U3patch.S new file mode 100644 index 00000000000..ecc302619a6 --- /dev/null +++ b/arch/sparc/lib/U3patch.S @@ -0,0 +1,33 @@ +/* U3patch.S: Patch Ultra-I routines with Ultra-III variant. + * + * Copyright (C) 2004 David S. Miller <davem@redhat.com> + */ + +#define BRANCH_ALWAYS 0x10680000 +#define NOP 0x01000000 +#define ULTRA3_DO_PATCH(OLD, NEW) \ + sethi %hi(NEW), %g1; \ + or %g1, %lo(NEW), %g1; \ + sethi %hi(OLD), %g2; \ + or %g2, %lo(OLD), %g2; \ + sub %g1, %g2, %g1; \ + sethi %hi(BRANCH_ALWAYS), %g3; \ + sll %g1, 11, %g1; \ + srl %g1, 11 + 2, %g1; \ + or %g3, %lo(BRANCH_ALWAYS), %g3; \ + or %g3, %g1, %g3; \ + stw %g3, [%g2]; \ + sethi %hi(NOP), %g3; \ + or %g3, %lo(NOP), %g3; \ + stw %g3, [%g2 + 0x4]; \ + flush %g2; + + .globl cheetah_patch_copyops + .type cheetah_patch_copyops,#function +cheetah_patch_copyops: + ULTRA3_DO_PATCH(memcpy, U3memcpy) + ULTRA3_DO_PATCH(___copy_from_user, U3copy_from_user) + ULTRA3_DO_PATCH(___copy_to_user, U3copy_to_user) + retl + nop + .size cheetah_patch_copyops,.-cheetah_patch_copyops diff --git a/arch/sparc/lib/VISsave.S b/arch/sparc/lib/VISsave.S new file mode 100644 index 00000000000..b320ae9e2e2 --- /dev/null +++ b/arch/sparc/lib/VISsave.S @@ -0,0 +1,144 @@ +/* + * VISsave.S: Code for saving FPU register state for + * VIS routines. One should not call this directly, + * but use macros provided in <asm/visasm.h>. + * + * Copyright (C) 1998 Jakub Jelinek (jj@ultra.linux.cz) + */ + +#include <asm/asi.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/visasm.h> +#include <asm/thread_info.h> + + .text + .globl VISenter, VISenterhalf + + /* On entry: %o5=current FPRS value, %g7 is callers address */ + /* May clobber %o5, %g1, %g2, %g3, %g7, %icc, %xcc */ + + /* Nothing special need be done here to handle pre-emption, this + * FPU save/restore mechanism is already preemption safe. 
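Stepping back to the *_DO_PATCH macros that recur in this diff (ULTRA3_DO_PATCH just above, NG_DO_PATCH earlier): they overwrite the first two instructions of the generic routine with a "branch always" to the CPU-specific one plus a nop in the delay slot. The displacement arithmetic in C form; patch_site is an illustrative name, and the real macros also execute 'flush' to keep the I-cache coherent, which plain C cannot express.

#include <stdint.h>

#define BRANCH_ALWAYS 0x10680000u   /* ba,pt %xcc with a 19-bit displacement */
#define NOP           0x01000000u

static void patch_site(uint32_t *old, uint32_t *new_func)
{
    uint32_t delta = (uint32_t)((uintptr_t)new_func - (uintptr_t)old);
    uint32_t disp19 = (delta >> 2) & 0x0007ffff;  /* word displacement, 19 bits
                                                   * (the sll 11 / srl 13 pair) */

    old[0] = BRANCH_ALWAYS | disp19;   /* branch to the replacement routine   */
    old[1] = NOP;                      /* fill the delay slot                 */
}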
+ */ + + .align 32 +VISenter: + ldub [%g6 + TI_FPDEPTH], %g1 + brnz,a,pn %g1, 1f + cmp %g1, 1 + stb %g0, [%g6 + TI_FPSAVED] + stx %fsr, [%g6 + TI_XFSR] +9: jmpl %g7 + %g0, %g0 + nop +1: bne,pn %icc, 2f + + srl %g1, 1, %g1 +vis1: ldub [%g6 + TI_FPSAVED], %g3 + stx %fsr, [%g6 + TI_XFSR] + or %g3, %o5, %g3 + stb %g3, [%g6 + TI_FPSAVED] + rd %gsr, %g3 + clr %g1 + ba,pt %xcc, 3f + + stx %g3, [%g6 + TI_GSR] +2: add %g6, %g1, %g3 + cmp %o5, FPRS_DU + be,pn %icc, 6f + sll %g1, 3, %g1 + stb %o5, [%g3 + TI_FPSAVED] + rd %gsr, %g2 + add %g6, %g1, %g3 + stx %g2, [%g3 + TI_GSR] + + add %g6, %g1, %g2 + stx %fsr, [%g2 + TI_XFSR] + sll %g1, 5, %g1 +3: andcc %o5, FPRS_DL|FPRS_DU, %g0 + be,pn %icc, 9b + add %g6, TI_FPREGS, %g2 + andcc %o5, FPRS_DL, %g0 + + be,pn %icc, 4f + add %g6, TI_FPREGS+0x40, %g3 + membar #Sync + stda %f0, [%g2 + %g1] ASI_BLK_P + stda %f16, [%g3 + %g1] ASI_BLK_P + membar #Sync + andcc %o5, FPRS_DU, %g0 + be,pn %icc, 5f +4: add %g1, 128, %g1 + membar #Sync + stda %f32, [%g2 + %g1] ASI_BLK_P + + stda %f48, [%g3 + %g1] ASI_BLK_P +5: membar #Sync + ba,pt %xcc, 80f + nop + + .align 32 +80: jmpl %g7 + %g0, %g0 + nop + +6: ldub [%g3 + TI_FPSAVED], %o5 + or %o5, FPRS_DU, %o5 + add %g6, TI_FPREGS+0x80, %g2 + stb %o5, [%g3 + TI_FPSAVED] + + sll %g1, 5, %g1 + add %g6, TI_FPREGS+0xc0, %g3 + wr %g0, FPRS_FEF, %fprs + membar #Sync + stda %f32, [%g2 + %g1] ASI_BLK_P + stda %f48, [%g3 + %g1] ASI_BLK_P + membar #Sync + ba,pt %xcc, 80f + nop + + .align 32 +80: jmpl %g7 + %g0, %g0 + nop + + .align 32 +VISenterhalf: + ldub [%g6 + TI_FPDEPTH], %g1 + brnz,a,pn %g1, 1f + cmp %g1, 1 + stb %g0, [%g6 + TI_FPSAVED] + stx %fsr, [%g6 + TI_XFSR] + clr %o5 + jmpl %g7 + %g0, %g0 + wr %g0, FPRS_FEF, %fprs + +1: bne,pn %icc, 2f + srl %g1, 1, %g1 + ba,pt %xcc, vis1 + sub %g7, 8, %g7 +2: addcc %g6, %g1, %g3 + sll %g1, 3, %g1 + andn %o5, FPRS_DU, %g2 + stb %g2, [%g3 + TI_FPSAVED] + + rd %gsr, %g2 + add %g6, %g1, %g3 + stx %g2, [%g3 + TI_GSR] + add %g6, %g1, %g2 + stx %fsr, [%g2 + TI_XFSR] + sll %g1, 5, %g1 +3: andcc %o5, FPRS_DL, %g0 + be,pn %icc, 4f + add %g6, TI_FPREGS, %g2 + + add %g6, TI_FPREGS+0x40, %g3 + membar #Sync + stda %f0, [%g2 + %g1] ASI_BLK_P + stda %f16, [%g3 + %g1] ASI_BLK_P + membar #Sync + ba,pt %xcc, 4f + nop + + .align 32 +4: and %o5, FPRS_DU, %o5 + jmpl %g7 + %g0, %g0 + wr %o5, FPRS_FEF, %fprs diff --git a/arch/sparc/lib/ashldi3.S b/arch/sparc/lib/ashldi3.S index 52418a0cb3d..86f60de07b0 100644 --- a/arch/sparc/lib/ashldi3.S +++ b/arch/sparc/lib/ashldi3.S @@ -1,14 +1,14 @@ -/* $Id: ashldi3.S,v 1.2 1999/11/19 04:11:46 davem Exp $ +/* * ashldi3.S: GCC emits these for certain drivers playing * with long longs. * * Copyright (C) 1999 David S. Miller (davem@redhat.com) */ +#include <linux/linkage.h> + .text - .align 4 - .globl __ashldi3 -__ashldi3: +ENTRY(__ashldi3) cmp %o2, 0 be 9f mov 0x20, %g2 @@ -32,3 +32,4 @@ __ashldi3: 9: retl nop +ENDPROC(__ashldi3) diff --git a/arch/sparc/lib/ashrdi3.S b/arch/sparc/lib/ashrdi3.S index 2848237598a..6eb8ba2dd50 100644 --- a/arch/sparc/lib/ashrdi3.S +++ b/arch/sparc/lib/ashrdi3.S @@ -1,14 +1,14 @@ -/* $Id: ashrdi3.S,v 1.4 1999/11/19 04:11:49 davem Exp $ +/* * ashrdi3.S: The filesystem code creates all kinds of references to * this little routine on the sparc with gcc. * * Copyright (C) 1995 David S. 
Miller (davem@caip.rutgers.edu) */ +#include <linux/linkage.h> + .text - .align 4 - .globl __ashrdi3 -__ashrdi3: +ENTRY(__ashrdi3) tst %o2 be 3f or %g0, 32, %g2 @@ -34,3 +34,4 @@ __ashrdi3: 3: jmpl %o7 + 8, %g0 nop +ENDPROC(__ashrdi3) diff --git a/arch/sparc/lib/atomic.S b/arch/sparc/lib/atomic.S deleted file mode 100644 index 178cbb8ae1b..00000000000 --- a/arch/sparc/lib/atomic.S +++ /dev/null @@ -1,99 +0,0 @@ -/* atomic.S: Move this stuff here for better ICACHE hit rates. - * - * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu) - */ - -#include <asm/ptrace.h> -#include <asm/psr.h> - - .text - .align 4 - - .globl __atomic_begin -__atomic_begin: - -#ifndef CONFIG_SMP - .globl ___xchg32_sun4c -___xchg32_sun4c: - rd %psr, %g3 - andcc %g3, PSR_PIL, %g0 - bne 1f - nop - wr %g3, PSR_PIL, %psr - nop; nop; nop -1: - andcc %g3, PSR_PIL, %g0 - ld [%g1], %g7 - bne 1f - st %g2, [%g1] - wr %g3, 0x0, %psr - nop; nop; nop -1: - mov %g7, %g2 - jmpl %o7 + 8, %g0 - mov %g4, %o7 - - .globl ___xchg32_sun4md -___xchg32_sun4md: - swap [%g1], %g2 - jmpl %o7 + 8, %g0 - mov %g4, %o7 -#endif - - /* Read asm-sparc/atomic.h carefully to understand how this works for SMP. - * Really, some things here for SMP are overly clever, go read the header. - */ - .globl ___atomic24_add -___atomic24_add: - rd %psr, %g3 ! Keep the code small, old way was stupid - nop; nop; nop; ! Let the bits set - or %g3, PSR_PIL, %g7 ! Disable interrupts - wr %g7, 0x0, %psr ! Set %psr - nop; nop; nop; ! Let the bits set -#ifdef CONFIG_SMP -1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. - orcc %g7, 0x0, %g0 ! Did we get it? - bne 1b ! Nope... - ld [%g1], %g7 ! Load locked atomic24_t - sra %g7, 8, %g7 ! Get signed 24-bit integer - add %g7, %g2, %g2 ! Add in argument - sll %g2, 8, %g7 ! Transpose back to atomic24_t - st %g7, [%g1] ! Clever: This releases the lock as well. -#else - ld [%g1], %g7 ! Load locked atomic24_t - add %g7, %g2, %g2 ! Add in argument - st %g2, [%g1] ! Store it back -#endif - wr %g3, 0x0, %psr ! Restore original PSR_PIL - nop; nop; nop; ! Let the bits set - jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h - mov %g4, %o7 ! Restore %o7 - - .globl ___atomic24_sub -___atomic24_sub: - rd %psr, %g3 ! Keep the code small, old way was stupid - nop; nop; nop; ! Let the bits set - or %g3, PSR_PIL, %g7 ! Disable interrupts - wr %g7, 0x0, %psr ! Set %psr - nop; nop; nop; ! Let the bits set -#ifdef CONFIG_SMP -1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. - orcc %g7, 0x0, %g0 ! Did we get it? - bne 1b ! Nope... - ld [%g1], %g7 ! Load locked atomic24_t - sra %g7, 8, %g7 ! Get signed 24-bit integer - sub %g7, %g2, %g2 ! Subtract argument - sll %g2, 8, %g7 ! Transpose back to atomic24_t - st %g7, [%g1] ! Clever: This releases the lock as well -#else - ld [%g1], %g7 ! Load locked atomic24_t - sub %g7, %g2, %g2 ! Subtract argument - st %g2, [%g1] ! Store it back -#endif - wr %g3, 0x0, %psr ! Restore original PSR_PIL - nop; nop; nop; ! Let the bits set - jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h - mov %g4, %o7 ! 
Restore %o7 - - .globl __atomic_end -__atomic_end: diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index cbddeb38ffd..1d32b54089a 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -7,7 +7,7 @@ * Based on asm-parisc/atomic.h Copyright (C) 2000 Philipp Rumpf */ -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/spinlock.h> #include <linux/module.h> @@ -16,7 +16,7 @@ #define ATOMIC_HASH(a) (&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)]) spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = { - [0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED + [0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash) }; #else /* SMP */ @@ -55,7 +55,7 @@ int atomic_cmpxchg(atomic_t *v, int old, int new) } EXPORT_SYMBOL(atomic_cmpxchg); -int atomic_add_unless(atomic_t *v, int a, int u) +int __atomic_add_unless(atomic_t *v, int a, int u) { int ret; unsigned long flags; @@ -65,9 +65,9 @@ int atomic_add_unless(atomic_t *v, int a, int u) if (ret != u) v->counter += a; spin_unlock_irqrestore(ATOMIC_HASH(v), flags); - return ret != u; + return ret; } -EXPORT_SYMBOL(atomic_add_unless); +EXPORT_SYMBOL(__atomic_add_unless); /* Atomic operations are already serializing */ void atomic_set(atomic_t *v, int i) diff --git a/arch/sparc/lib/atomic_64.S b/arch/sparc/lib/atomic_64.S new file mode 100644 index 00000000000..85c233d0a34 --- /dev/null +++ b/arch/sparc/lib/atomic_64.S @@ -0,0 +1,133 @@ +/* atomic.S: These things are too big to do inline. + * + * Copyright (C) 1999, 2007 2012 David S. Miller (davem@davemloft.net) + */ + +#include <linux/linkage.h> +#include <asm/asi.h> +#include <asm/backoff.h> + + .text + + /* Two versions of the atomic routines, one that + * does not return a value and does not perform + * memory barriers, and a second which returns + * a value and does the barriers. 
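A C analogue of the compare-and-swap retry pattern used by the atomic_* routines that follow, written with the GCC/Clang __atomic builtins; the function name is illustrative and the counted pause loop merely stands in for BACKOFF_SPIN.

static int atomic_add_return_sketch(int inc, int *p)
{
    int old = __atomic_load_n(p, __ATOMIC_RELAXED);
    unsigned int backoff = 1;

    /* On failure the builtin refreshes 'old' with the current value,
     * exactly like re-reading after a failed cas.
     */
    while (!__atomic_compare_exchange_n(p, &old, old + inc, 0,
                                        __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) {
        for (volatile unsigned int i = 0; i < backoff; i++)
            ;                        /* idle spin standing in for BACKOFF_SPIN */
        if (backoff < 4096)
            backoff <<= 1;           /* exponential backoff between retries    */
    }
    return old + inc;                /* value produced by our update           */
}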
+ */ +ENTRY(atomic_add) /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: lduw [%o1], %g1 + add %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic_add) + +ENTRY(atomic_sub) /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: lduw [%o1], %g1 + sub %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic_sub) + +ENTRY(atomic_add_ret) /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: lduw [%o1], %g1 + add %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, BACKOFF_LABEL(2f, 1b) + add %g1, %o0, %g1 + retl + sra %g1, 0, %o0 +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic_add_ret) + +ENTRY(atomic_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: lduw [%o1], %g1 + sub %g1, %o0, %g7 + cas [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %icc, BACKOFF_LABEL(2f, 1b) + sub %g1, %o0, %g1 + retl + sra %g1, 0, %o0 +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic_sub_ret) + +ENTRY(atomic64_add) /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: ldx [%o1], %g1 + add %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic64_add) + +ENTRY(atomic64_sub) /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: ldx [%o1], %g1 + sub %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic64_sub) + +ENTRY(atomic64_add_ret) /* %o0 = increment, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: ldx [%o1], %g1 + add %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + add %g1, %o0, %o0 +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic64_add_ret) + +ENTRY(atomic64_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: ldx [%o1], %g1 + sub %g1, %o0, %g7 + casx [%o1], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + sub %g1, %o0, %o0 +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic64_sub_ret) + +ENTRY(atomic64_dec_if_positive) /* %o0 = atomic_ptr */ + BACKOFF_SETUP(%o2) +1: ldx [%o0], %g1 + brlez,pn %g1, 3f + sub %g1, 1, %g7 + casx [%o0], %g1, %g7 + cmp %g1, %g7 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop +3: retl + sub %g1, 1, %o0 +2: BACKOFF_SPIN(%o2, %o3, 1b) +ENDPROC(atomic64_dec_if_positive) diff --git a/arch/sparc/lib/bitext.c b/arch/sparc/lib/bitext.c index 764b3eb7b60..8ec4e9c0251 100644 --- a/arch/sparc/lib/bitext.c +++ b/arch/sparc/lib/bitext.c @@ -10,7 +10,7 @@ */ #include <linux/string.h> -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <asm/bitext.h> @@ -80,8 +80,7 @@ int bit_map_string_get(struct bit_map *t, int len, int align) while (test_bit(offset + i, t->map) == 0) { i++; if (i == len) { - for (i = 0; i < len; i++) - __set_bit(offset + i, t->map); + bitmap_set(t->map, offset, len); if (offset == t->first_free) t->first_free = find_next_zero_bit (t->map, t->size, @@ -120,11 +119,7 @@ void bit_map_clear(struct bit_map *t, int offset, int len) void bit_map_init(struct bit_map *t, unsigned long *map, int size) { - - if ((size & 07) != 0) - BUG(); - memset(map, 0, size>>3); - + bitmap_zero(map, size); memset(t, 0, sizeof *t); spin_lock_init(&t->lock); t->map = map; diff --git a/arch/sparc/lib/bitops.S b/arch/sparc/lib/bitops.S 
new file mode 100644 index 00000000000..36f72cc0e67 --- /dev/null +++ b/arch/sparc/lib/bitops.S @@ -0,0 +1,130 @@ +/* bitops.S: Sparc64 atomic bit operations. + * + * Copyright (C) 2000, 2007 David S. Miller (davem@davemloft.net) + */ + +#include <linux/linkage.h> +#include <asm/asi.h> +#include <asm/backoff.h> + + .text + +ENTRY(test_and_set_bit) /* %o0=nr, %o1=addr */ + BACKOFF_SETUP(%o3) + srlx %o0, 6, %g1 + mov 1, %o2 + sllx %g1, 3, %g3 + and %o0, 63, %g2 + sllx %o2, %g2, %o2 + add %o1, %g3, %o1 +1: ldx [%o1], %g7 + or %g7, %o2, %g1 + casx [%o1], %g7, %g1 + cmp %g7, %g1 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + and %g7, %o2, %g2 + clr %o0 + movrne %g2, 1, %o0 + retl + nop +2: BACKOFF_SPIN(%o3, %o4, 1b) +ENDPROC(test_and_set_bit) + +ENTRY(test_and_clear_bit) /* %o0=nr, %o1=addr */ + BACKOFF_SETUP(%o3) + srlx %o0, 6, %g1 + mov 1, %o2 + sllx %g1, 3, %g3 + and %o0, 63, %g2 + sllx %o2, %g2, %o2 + add %o1, %g3, %o1 +1: ldx [%o1], %g7 + andn %g7, %o2, %g1 + casx [%o1], %g7, %g1 + cmp %g7, %g1 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + and %g7, %o2, %g2 + clr %o0 + movrne %g2, 1, %o0 + retl + nop +2: BACKOFF_SPIN(%o3, %o4, 1b) +ENDPROC(test_and_clear_bit) + +ENTRY(test_and_change_bit) /* %o0=nr, %o1=addr */ + BACKOFF_SETUP(%o3) + srlx %o0, 6, %g1 + mov 1, %o2 + sllx %g1, 3, %g3 + and %o0, 63, %g2 + sllx %o2, %g2, %o2 + add %o1, %g3, %o1 +1: ldx [%o1], %g7 + xor %g7, %o2, %g1 + casx [%o1], %g7, %g1 + cmp %g7, %g1 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + and %g7, %o2, %g2 + clr %o0 + movrne %g2, 1, %o0 + retl + nop +2: BACKOFF_SPIN(%o3, %o4, 1b) +ENDPROC(test_and_change_bit) + +ENTRY(set_bit) /* %o0=nr, %o1=addr */ + BACKOFF_SETUP(%o3) + srlx %o0, 6, %g1 + mov 1, %o2 + sllx %g1, 3, %g3 + and %o0, 63, %g2 + sllx %o2, %g2, %o2 + add %o1, %g3, %o1 +1: ldx [%o1], %g7 + or %g7, %o2, %g1 + casx [%o1], %g7, %g1 + cmp %g7, %g1 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o3, %o4, 1b) +ENDPROC(set_bit) + +ENTRY(clear_bit) /* %o0=nr, %o1=addr */ + BACKOFF_SETUP(%o3) + srlx %o0, 6, %g1 + mov 1, %o2 + sllx %g1, 3, %g3 + and %o0, 63, %g2 + sllx %o2, %g2, %o2 + add %o1, %g3, %o1 +1: ldx [%o1], %g7 + andn %g7, %o2, %g1 + casx [%o1], %g7, %g1 + cmp %g7, %g1 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o3, %o4, 1b) +ENDPROC(clear_bit) + +ENTRY(change_bit) /* %o0=nr, %o1=addr */ + BACKOFF_SETUP(%o3) + srlx %o0, 6, %g1 + mov 1, %o2 + sllx %g1, 3, %g3 + and %o0, 63, %g2 + sllx %o2, %g2, %o2 + add %o1, %g3, %o1 +1: ldx [%o1], %g7 + xor %g7, %o2, %g1 + casx [%o1], %g7, %g1 + cmp %g7, %g1 + bne,pn %xcc, BACKOFF_LABEL(2f, 1b) + nop + retl + nop +2: BACKOFF_SPIN(%o3, %o4, 1b) +ENDPROC(change_bit) diff --git a/arch/sparc/lib/blockops.S b/arch/sparc/lib/blockops.S index a7c7ffaa4a9..3c771011ff4 100644 --- a/arch/sparc/lib/blockops.S +++ b/arch/sparc/lib/blockops.S @@ -1,9 +1,10 @@ -/* $Id: blockops.S,v 1.8 1998/01/30 10:58:44 jj Exp $ +/* * blockops.S: Common block zero optimized routines. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) */ +#include <linux/linkage.h> #include <asm/page.h> /* Zero out 64 bytes of memory at (buf + offset). 
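The bit operations above all follow the same shape: locate the 64-bit word containing bit nr (nr >> 6, as the srlx/sllx pair computes), build a mask from the low six bits, and casx until the update sticks. The same semantics in C via an atomic fetch-or; test_and_set_bit_sketch is an illustrative name and the builtins are GCC/Clang.

#include <stdint.h>

static int test_and_set_bit_sketch(unsigned long nr, uint64_t *addr)
{
    uint64_t *word = addr + (nr >> 6);    /* which 64-bit word holds the bit  */
    uint64_t mask = 1ULL << (nr & 63);    /* which bit inside that word       */
    uint64_t old = __atomic_fetch_or(word, mask, __ATOMIC_SEQ_CST);

    return (old & mask) != 0;             /* non-zero if the bit was already set */
}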
@@ -44,10 +45,7 @@ */ .text - .align 4 - .globl bzero_1page, __copy_1page - -bzero_1page: +ENTRY(bzero_1page) /* NOTE: If you change the number of insns of this routine, please check * arch/sparc/mm/hypersparc.S */ /* %o0 = buf */ @@ -65,8 +63,9 @@ bzero_1page: retl nop +ENDPROC(bzero_1page) -__copy_1page: +ENTRY(__copy_1page) /* NOTE: If you change the number of insns of this routine, please check * arch/sparc/mm/hypersparc.S */ /* %o0 = dst, %o1 = src */ @@ -87,3 +86,4 @@ __copy_1page: retl nop +ENDPROC(__copy_1page) diff --git a/arch/sparc/lib/bzero.S b/arch/sparc/lib/bzero.S new file mode 100644 index 00000000000..8c058114b64 --- /dev/null +++ b/arch/sparc/lib/bzero.S @@ -0,0 +1,145 @@ +/* bzero.S: Simple prefetching memset, bzero, and clear_user + * implementations. + * + * Copyright (C) 2005 David S. Miller <davem@davemloft.net> + */ + +#include <linux/linkage.h> + + .text + +ENTRY(memset) /* %o0=buf, %o1=pat, %o2=len */ + and %o1, 0xff, %o3 + mov %o2, %o1 + sllx %o3, 8, %g1 + or %g1, %o3, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %xcc, 1f + or %g1, %o2, %o2 + +ENTRY(__bzero) /* %o0=buf, %o1=len */ + clr %o2 +1: mov %o0, %o3 + brz,pn %o1, __bzero_done + cmp %o1, 16 + bl,pn %icc, __bzero_tiny + prefetch [%o0 + 0x000], #n_writes + andcc %o0, 0x3, %g0 + be,pt %icc, 2f +1: stb %o2, [%o0 + 0x00] + add %o0, 1, %o0 + andcc %o0, 0x3, %g0 + bne,pn %icc, 1b + sub %o1, 1, %o1 +2: andcc %o0, 0x7, %g0 + be,pt %icc, 3f + stw %o2, [%o0 + 0x00] + sub %o1, 4, %o1 + add %o0, 4, %o0 +3: and %o1, 0x38, %g1 + cmp %o1, 0x40 + andn %o1, 0x3f, %o4 + bl,pn %icc, 5f + and %o1, 0x7, %o1 + prefetch [%o0 + 0x040], #n_writes + prefetch [%o0 + 0x080], #n_writes + prefetch [%o0 + 0x0c0], #n_writes + prefetch [%o0 + 0x100], #n_writes + prefetch [%o0 + 0x140], #n_writes +4: prefetch [%o0 + 0x180], #n_writes + stx %o2, [%o0 + 0x00] + stx %o2, [%o0 + 0x08] + stx %o2, [%o0 + 0x10] + stx %o2, [%o0 + 0x18] + stx %o2, [%o0 + 0x20] + stx %o2, [%o0 + 0x28] + stx %o2, [%o0 + 0x30] + stx %o2, [%o0 + 0x38] + subcc %o4, 0x40, %o4 + bne,pt %icc, 4b + add %o0, 0x40, %o0 + brz,pn %g1, 6f + nop +5: stx %o2, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 5b + add %o0, 0x8, %o0 +6: brz,pt %o1, __bzero_done + nop +__bzero_tiny: +1: stb %o2, [%o0 + 0x00] + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 1, %o0 +__bzero_done: + retl + mov %o3, %o0 +ENDPROC(__bzero) +ENDPROC(memset) + +#define EX_ST(x,y) \ +98: x,y; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_o1; \ + .text; \ + .align 4; + +ENTRY(__clear_user) /* %o0=buf, %o1=len */ + brz,pn %o1, __clear_user_done + cmp %o1, 16 + bl,pn %icc, __clear_user_tiny + EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes) + andcc %o0, 0x3, %g0 + be,pt %icc, 2f +1: EX_ST(stba %g0, [%o0 + 0x00] %asi) + add %o0, 1, %o0 + andcc %o0, 0x3, %g0 + bne,pn %icc, 1b + sub %o1, 1, %o1 +2: andcc %o0, 0x7, %g0 + be,pt %icc, 3f + EX_ST(stwa %g0, [%o0 + 0x00] %asi) + sub %o1, 4, %o1 + add %o0, 4, %o0 +3: and %o1, 0x38, %g1 + cmp %o1, 0x40 + andn %o1, 0x3f, %o4 + bl,pn %icc, 5f + and %o1, 0x7, %o1 + EX_ST(prefetcha [%o0 + 0x040] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x080] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x0c0] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x100] %asi, #n_writes) + EX_ST(prefetcha [%o0 + 0x140] %asi, #n_writes) +4: EX_ST(prefetcha [%o0 + 0x180] %asi, #n_writes) + EX_ST(stxa %g0, [%o0 + 0x00] %asi) + EX_ST(stxa %g0, [%o0 + 0x08] %asi) + EX_ST(stxa %g0, [%o0 + 0x10] %asi) + EX_ST(stxa %g0, [%o0 + 0x18] %asi) + EX_ST(stxa %g0, [%o0 + 0x20] %asi) + 
EX_ST(stxa %g0, [%o0 + 0x28] %asi) + EX_ST(stxa %g0, [%o0 + 0x30] %asi) + EX_ST(stxa %g0, [%o0 + 0x38] %asi) + subcc %o4, 0x40, %o4 + bne,pt %icc, 4b + add %o0, 0x40, %o0 + brz,pn %g1, 6f + nop +5: EX_ST(stxa %g0, [%o0 + 0x00] %asi) + subcc %g1, 8, %g1 + bne,pt %icc, 5b + add %o0, 0x8, %o0 +6: brz,pt %o1, __clear_user_done + nop +__clear_user_tiny: +1: EX_ST(stba %g0, [%o0 + 0x00] %asi) + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 1, %o0 +__clear_user_done: + retl + clr %o0 +ENDPROC(__clear_user) diff --git a/arch/sparc/lib/checksum.S b/arch/sparc/lib/checksum_32.S index 77f228533d4..0084c3361e1 100644 --- a/arch/sparc/lib/checksum.S +++ b/arch/sparc/lib/checksum_32.S @@ -289,10 +289,16 @@ cc_end_cruft: /* Also, handle the alignment code out of band. */ cc_dword_align: - cmp %g1, 6 - bl,a ccte + cmp %g1, 16 + bge 1f + srl %g1, 1, %o3 +2: cmp %o3, 0 + be,a ccte andcc %g1, 0xf, %o3 - andcc %o0, 0x1, %g0 + andcc %o3, %o0, %g0 ! Check %o0 only (%o1 has the same last 2 bits) + be,a 2b + srl %o3, 1, %o3 +1: andcc %o0, 0x1, %g0 bne ccslow andcc %o0, 0x2, %g0 be 1f @@ -560,7 +566,7 @@ __csum_partial_copy_end: mov %i0, %o1 mov %i1, %o0 5: - call __memcpy + call memcpy mov %i2, %o2 tst %o0 bne,a 2f diff --git a/arch/sparc/lib/checksum_64.S b/arch/sparc/lib/checksum_64.S new file mode 100644 index 00000000000..1d230f693dc --- /dev/null +++ b/arch/sparc/lib/checksum_64.S @@ -0,0 +1,173 @@ +/* checksum.S: Sparc V9 optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996, 2000 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + + .text + +csum_partial_fix_alignment: + /* We checked for zero length already, so there must be + * at least one byte. + */ + be,pt %icc, 1f + nop + ldub [%o0 + 0x00], %o4 + add %o0, 1, %o0 + sub %o1, 1, %o1 +1: andcc %o0, 0x2, %g0 + be,pn %icc, csum_partial_post_align + cmp %o1, 2 + blu,pn %icc, csum_partial_end_cruft + nop + lduh [%o0 + 0x00], %o5 + add %o0, 2, %o0 + sub %o1, 2, %o1 + ba,pt %xcc, csum_partial_post_align + add %o5, %o4, %o4 + + .align 32 + .globl csum_partial +csum_partial: /* %o0=buff, %o1=len, %o2=sum */ + prefetch [%o0 + 0x000], #n_reads + clr %o4 + prefetch [%o0 + 0x040], #n_reads + brz,pn %o1, csum_partial_finish + andcc %o0, 0x3, %g0 + + /* We "remember" whether the lowest bit in the address + * was set in %g7. Because if it is, we have to swap + * upper and lower 8 bit fields of the sum we calculate. + */ + bne,pn %icc, csum_partial_fix_alignment + andcc %o0, 0x1, %g7 + +csum_partial_post_align: + prefetch [%o0 + 0x080], #n_reads + andncc %o1, 0x3f, %o3 + + prefetch [%o0 + 0x0c0], #n_reads + sub %o1, %o3, %o1 + brz,pn %o3, 2f + prefetch [%o0 + 0x100], #n_reads + + /* So that we don't need to use the non-pairing + * add-with-carry instructions we accumulate 32-bit + * values into a 64-bit register. At the end of the + * loop we fold it down to 32-bits and so on. 
+ */ + prefetch [%o0 + 0x140], #n_reads +1: lduw [%o0 + 0x00], %o5 + lduw [%o0 + 0x04], %g1 + lduw [%o0 + 0x08], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x0c], %g3 + add %o4, %g1, %o4 + lduw [%o0 + 0x10], %o5 + add %o4, %g2, %o4 + lduw [%o0 + 0x14], %g1 + add %o4, %g3, %o4 + lduw [%o0 + 0x18], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x1c], %g3 + add %o4, %g1, %o4 + lduw [%o0 + 0x20], %o5 + add %o4, %g2, %o4 + lduw [%o0 + 0x24], %g1 + add %o4, %g3, %o4 + lduw [%o0 + 0x28], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x2c], %g3 + add %o4, %g1, %o4 + lduw [%o0 + 0x30], %o5 + add %o4, %g2, %o4 + lduw [%o0 + 0x34], %g1 + add %o4, %g3, %o4 + lduw [%o0 + 0x38], %g2 + add %o4, %o5, %o4 + lduw [%o0 + 0x3c], %g3 + add %o4, %g1, %o4 + prefetch [%o0 + 0x180], #n_reads + add %o4, %g2, %o4 + subcc %o3, 0x40, %o3 + add %o0, 0x40, %o0 + bne,pt %icc, 1b + add %o4, %g3, %o4 + +2: and %o1, 0x3c, %o3 + brz,pn %o3, 2f + sub %o1, %o3, %o1 +1: lduw [%o0 + 0x00], %o5 + subcc %o3, 0x4, %o3 + add %o0, 0x4, %o0 + bne,pt %icc, 1b + add %o4, %o5, %o4 + +2: + /* fold 64-->32 */ + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +csum_partial_end_cruft: + /* %o4 has the 16-bit sum we have calculated so-far. */ + cmp %o1, 2 + blu,pt %icc, 1f + nop + lduh [%o0 + 0x00], %o5 + sub %o1, 2, %o1 + add %o0, 2, %o0 + add %o4, %o5, %o4 +1: brz,pt %o1, 1f + nop + ldub [%o0 + 0x00], %o5 + sub %o1, 1, %o1 + add %o0, 1, %o0 + sllx %o5, 8, %o5 + add %o4, %o5, %o4 +1: + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +1: brz,pt %g7, 1f + nop + + /* We started with an odd byte, byte-swap the result. */ + srl %o4, 8, %o5 + and %o4, 0xff, %g1 + sll %g1, 8, %g1 + or %o5, %g1, %o4 + +1: addcc %o2, %o4, %o2 + addc %g0, %o2, %o2 + +csum_partial_finish: + retl + srl %o2, 0, %o0 diff --git a/arch/sparc/lib/clear_page.S b/arch/sparc/lib/clear_page.S new file mode 100644 index 00000000000..46272dfc26e --- /dev/null +++ b/arch/sparc/lib/clear_page.S @@ -0,0 +1,103 @@ +/* clear_page.S: UltraSparc optimized clear page. + * + * Copyright (C) 1996, 1998, 1999, 2000, 2004 David S. Miller (davem@redhat.com) + * Copyright (C) 1997 Jakub Jelinek (jakub@redhat.com) + */ + +#include <asm/visasm.h> +#include <asm/thread_info.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/spitfire.h> +#include <asm/head.h> + + /* What we used to do was lock a TLB entry into a specific + * TLB slot, clear the page with interrupts disabled, then + * restore the original TLB entry. This was great for + * disturbing the TLB as little as possible, but it meant + * we had to keep interrupts disabled for a long time. + * + * Now, we simply use the normal TLB loading mechanism, + * and this makes the cpu choose a slot all by itself. + * Then we do a normal TLB flush on exit. We need only + * disable preemption during the clear. + */ + + .text + + .globl _clear_page +_clear_page: /* %o0=dest */ + ba,pt %xcc, clear_page_common + clr %o4 + + /* This thing is pretty important, it shows up + * on the profiles via do_anonymous_page(). 
+ */ + .align 32 + .globl clear_user_page +clear_user_page: /* %o0=dest, %o1=vaddr */ + lduw [%g6 + TI_PRE_COUNT], %o2 + sethi %hi(PAGE_OFFSET), %g2 + sethi %hi(PAGE_SIZE), %o4 + + ldx [%g2 + %lo(PAGE_OFFSET)], %g2 + sethi %hi(PAGE_KERNEL_LOCKED), %g3 + + ldx [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 + sub %o0, %g2, %g1 ! paddr + + and %o1, %o4, %o0 ! vaddr D-cache alias bit + + or %g1, %g3, %g1 ! TTE data + sethi %hi(TLBTEMP_BASE), %o3 + + add %o2, 1, %o4 + add %o0, %o3, %o0 ! TTE vaddr + + /* Disable preemption. */ + mov TLB_TAG_ACCESS, %g3 + stw %o4, [%g6 + TI_PRE_COUNT] + + /* Load TLB entry. */ + rdpr %pstate, %o4 + wrpr %o4, PSTATE_IE, %pstate + stxa %o0, [%g3] ASI_DMMU + stxa %g1, [%g0] ASI_DTLB_DATA_IN + sethi %hi(KERNBASE), %g1 + flush %g1 + wrpr %o4, 0x0, %pstate + + mov 1, %o4 + +clear_page_common: + VISEntryHalf + membar #StoreLoad | #StoreStore | #LoadStore + fzero %f0 + sethi %hi(PAGE_SIZE/64), %o1 + mov %o0, %g1 ! remember vaddr for tlbflush + fzero %f2 + or %o1, %lo(PAGE_SIZE/64), %o1 + faddd %f0, %f2, %f4 + fmuld %f0, %f2, %f6 + faddd %f0, %f2, %f8 + fmuld %f0, %f2, %f10 + + faddd %f0, %f2, %f12 + fmuld %f0, %f2, %f14 +1: stda %f0, [%o0 + %g0] ASI_BLK_P + subcc %o1, 1, %o1 + bne,pt %icc, 1b + add %o0, 0x40, %o0 + membar #Sync + VISExitHalf + + brz,pn %o4, out + nop + + stxa %g0, [%g1] ASI_DMMU_DEMAP + membar #Sync + stw %o2, [%g6 + TI_PRE_COUNT] + +out: retl + nop + diff --git a/arch/sparc/lib/copy_in_user.S b/arch/sparc/lib/copy_in_user.S new file mode 100644 index 00000000000..302c0e60dc2 --- /dev/null +++ b/arch/sparc/lib/copy_in_user.S @@ -0,0 +1,92 @@ +/* copy_in_user.S: Copy from userspace to userspace. + * + * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) + */ + +#include <linux/linkage.h> +#include <asm/asi.h> + +#define XCC xcc + +#define EX(x,y) \ +98: x,y; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, __retl_one; \ + .text; \ + .align 4; + + .register %g2,#scratch + .register %g3,#scratch + + .text + .align 32 + + /* Don't try to get too fancy here, just nice and + * simple. This is predominantly used for well aligned + * small copies in the compat layer. It is also used + * to copy register windows around during thread cloning. + */ + +ENTRY(___copy_in_user) /* %o0=dst, %o1=src, %o2=len */ + cmp %o2, 0 + be,pn %XCC, 85f + or %o0, %o1, %o3 + cmp %o2, 16 + bleu,a,pn %XCC, 80f + or %o3, %o2, %o3 + + /* 16 < len <= 64 */ + andcc %o3, 0x7, %g0 + bne,pn %XCC, 90f + nop + + andn %o2, 0x7, %o4 + and %o2, 0x7, %o2 +1: subcc %o4, 0x8, %o4 + EX(ldxa [%o1] %asi, %o5) + EX(stxa %o5, [%o0] %asi) + add %o1, 0x8, %o1 + bgu,pt %XCC, 1b + add %o0, 0x8, %o0 + andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + EX(lduwa [%o1] %asi, %o5) + EX(stwa %o5, [%o0] %asi) + add %o1, 0x4, %o1 + add %o0, 0x4, %o0 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %xcc, 90f + nop + +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + nop + +82: + subcc %o2, 4, %o2 + EX(lduwa [%o1] %asi, %g1) + EX(stwa %g1, [%o0] %asi) + add %o1, 4, %o1 + bgu,pt %XCC, 82b + add %o0, 4, %o0 + +85: retl + clr %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + EX(lduba [%o1] %asi, %g1) + EX(stba %g1, [%o0] %asi) + add %o1, 1, %o1 + bgu,pt %XCC, 90b + add %o0, 1, %o0 + retl + clr %o0 +ENDPROC(___copy_in_user) diff --git a/arch/sparc/lib/copy_page.S b/arch/sparc/lib/copy_page.S new file mode 100644 index 00000000000..dd16c61f326 --- /dev/null +++ b/arch/sparc/lib/copy_page.S @@ -0,0 +1,250 @@ +/* clear_page.S: UltraSparc optimized copy page. 
+ * + * Copyright (C) 1996, 1998, 1999, 2000, 2004 David S. Miller (davem@redhat.com) + * Copyright (C) 1997 Jakub Jelinek (jakub@redhat.com) + */ + +#include <asm/visasm.h> +#include <asm/thread_info.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/spitfire.h> +#include <asm/head.h> + + /* What we used to do was lock a TLB entry into a specific + * TLB slot, clear the page with interrupts disabled, then + * restore the original TLB entry. This was great for + * disturbing the TLB as little as possible, but it meant + * we had to keep interrupts disabled for a long time. + * + * Now, we simply use the normal TLB loading mechanism, + * and this makes the cpu choose a slot all by itself. + * Then we do a normal TLB flush on exit. We need only + * disable preemption during the clear. + */ + +#define DCACHE_SIZE (PAGE_SIZE * 2) + +#if (PAGE_SHIFT == 13) +#define PAGE_SIZE_REM 0x80 +#elif (PAGE_SHIFT == 16) +#define PAGE_SIZE_REM 0x100 +#else +#error Wrong PAGE_SHIFT specified +#endif + +#define TOUCH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7) \ + fsrc2 %reg0, %f48; fsrc2 %reg1, %f50; \ + fsrc2 %reg2, %f52; fsrc2 %reg3, %f54; \ + fsrc2 %reg4, %f56; fsrc2 %reg5, %f58; \ + fsrc2 %reg6, %f60; fsrc2 %reg7, %f62; + + .text + + .align 32 + .globl copy_user_page + .type copy_user_page,#function +copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ + lduw [%g6 + TI_PRE_COUNT], %o4 + sethi %hi(PAGE_OFFSET), %g2 + sethi %hi(PAGE_SIZE), %o3 + + ldx [%g2 + %lo(PAGE_OFFSET)], %g2 + sethi %hi(PAGE_KERNEL_LOCKED), %g3 + + ldx [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 + sub %o0, %g2, %g1 ! dest paddr + + sub %o1, %g2, %g2 ! src paddr + + and %o2, %o3, %o0 ! vaddr D-cache alias bit + or %g1, %g3, %g1 ! dest TTE data + + or %g2, %g3, %g2 ! src TTE data + sethi %hi(TLBTEMP_BASE), %o3 + + sethi %hi(DCACHE_SIZE), %o1 + add %o0, %o3, %o0 ! dest TTE vaddr + + add %o4, 1, %o2 + add %o0, %o1, %o1 ! src TTE vaddr + + /* Disable preemption. */ + mov TLB_TAG_ACCESS, %g3 + stw %o2, [%g6 + TI_PRE_COUNT] + + /* Load TLB entries. 
*/ + rdpr %pstate, %o2 + wrpr %o2, PSTATE_IE, %pstate + stxa %o0, [%g3] ASI_DMMU + stxa %g1, [%g0] ASI_DTLB_DATA_IN + membar #Sync + stxa %o1, [%g3] ASI_DMMU + stxa %g2, [%g0] ASI_DTLB_DATA_IN + membar #Sync + wrpr %o2, 0x0, %pstate + +cheetah_copy_page_insn: + ba,pt %xcc, 9f + nop + +1: + VISEntryHalf + membar #StoreLoad | #StoreStore | #LoadStore + sethi %hi((PAGE_SIZE/64)-2), %o2 + mov %o0, %g1 + prefetch [%o1 + 0x000], #one_read + or %o2, %lo((PAGE_SIZE/64)-2), %o2 + prefetch [%o1 + 0x040], #one_read + prefetch [%o1 + 0x080], #one_read + prefetch [%o1 + 0x0c0], #one_read + ldd [%o1 + 0x000], %f0 + prefetch [%o1 + 0x100], #one_read + ldd [%o1 + 0x008], %f2 + prefetch [%o1 + 0x140], #one_read + ldd [%o1 + 0x010], %f4 + prefetch [%o1 + 0x180], #one_read + fsrc2 %f0, %f16 + ldd [%o1 + 0x018], %f6 + fsrc2 %f2, %f18 + ldd [%o1 + 0x020], %f8 + fsrc2 %f4, %f20 + ldd [%o1 + 0x028], %f10 + fsrc2 %f6, %f22 + ldd [%o1 + 0x030], %f12 + fsrc2 %f8, %f24 + ldd [%o1 + 0x038], %f14 + fsrc2 %f10, %f26 + ldd [%o1 + 0x040], %f0 +1: ldd [%o1 + 0x048], %f2 + fsrc2 %f12, %f28 + ldd [%o1 + 0x050], %f4 + fsrc2 %f14, %f30 + stda %f16, [%o0] ASI_BLK_P + ldd [%o1 + 0x058], %f6 + fsrc2 %f0, %f16 + ldd [%o1 + 0x060], %f8 + fsrc2 %f2, %f18 + ldd [%o1 + 0x068], %f10 + fsrc2 %f4, %f20 + ldd [%o1 + 0x070], %f12 + fsrc2 %f6, %f22 + ldd [%o1 + 0x078], %f14 + fsrc2 %f8, %f24 + ldd [%o1 + 0x080], %f0 + prefetch [%o1 + 0x180], #one_read + fsrc2 %f10, %f26 + subcc %o2, 1, %o2 + add %o0, 0x40, %o0 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ldd [%o1 + 0x048], %f2 + fsrc2 %f12, %f28 + ldd [%o1 + 0x050], %f4 + fsrc2 %f14, %f30 + stda %f16, [%o0] ASI_BLK_P + ldd [%o1 + 0x058], %f6 + fsrc2 %f0, %f16 + ldd [%o1 + 0x060], %f8 + fsrc2 %f2, %f18 + ldd [%o1 + 0x068], %f10 + fsrc2 %f4, %f20 + ldd [%o1 + 0x070], %f12 + fsrc2 %f6, %f22 + add %o0, 0x40, %o0 + ldd [%o1 + 0x078], %f14 + fsrc2 %f8, %f24 + fsrc2 %f10, %f26 + fsrc2 %f12, %f28 + fsrc2 %f14, %f30 + stda %f16, [%o0] ASI_BLK_P + membar #Sync + VISExitHalf + ba,pt %xcc, 5f + nop + +9: + VISEntry + ldub [%g6 + TI_FAULT_CODE], %g3 + mov %o0, %g1 + cmp %g3, 0 + rd %asi, %g3 + be,a,pt %icc, 1f + wr %g0, ASI_BLK_P, %asi + wr %g0, ASI_BLK_COMMIT_P, %asi +1: ldda [%o1] ASI_BLK_P, %f0 + add %o1, 0x40, %o1 + ldda [%o1] ASI_BLK_P, %f16 + add %o1, 0x40, %o1 + sethi %hi(PAGE_SIZE), %o2 +1: TOUCH(f0, f2, f4, f6, f8, f10, f12, f14) + ldda [%o1] ASI_BLK_P, %f32 + stda %f48, [%o0] %asi + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + add %o0, 0x40, %o0 + TOUCH(f16, f18, f20, f22, f24, f26, f28, f30) + ldda [%o1] ASI_BLK_P, %f0 + stda %f48, [%o0] %asi + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + add %o0, 0x40, %o0 + TOUCH(f32, f34, f36, f38, f40, f42, f44, f46) + ldda [%o1] ASI_BLK_P, %f16 + stda %f48, [%o0] %asi + sub %o2, 0x40, %o2 + add %o1, 0x40, %o1 + cmp %o2, PAGE_SIZE_REM + bne,pt %xcc, 1b + add %o0, 0x40, %o0 +#if (PAGE_SHIFT == 16) + TOUCH(f0, f2, f4, f6, f8, f10, f12, f14) + ldda [%o1] ASI_BLK_P, %f32 + stda %f48, [%o0] %asi + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + add %o0, 0x40, %o0 + TOUCH(f16, f18, f20, f22, f24, f26, f28, f30) + ldda [%o1] ASI_BLK_P, %f0 + stda %f48, [%o0] %asi + add %o1, 0x40, %o1 + sub %o2, 0x40, %o2 + add %o0, 0x40, %o0 + membar #Sync + stda %f32, [%o0] %asi + add %o0, 0x40, %o0 + stda %f0, [%o0] %asi +#else + membar #Sync + stda %f0, [%o0] %asi + add %o0, 0x40, %o0 + stda %f16, [%o0] %asi +#endif + membar #Sync + wr %g3, 0x0, %asi + VISExit + +5: + stxa %g0, [%g1] ASI_DMMU_DEMAP + membar #Sync + + sethi %hi(DCACHE_SIZE), %g2 + stxa %g0, [%g1 + %g2] ASI_DMMU_DEMAP + 
membar #Sync + + retl + stw %o4, [%g6 + TI_PRE_COUNT] + + .size copy_user_page, .-copy_user_page + + .globl cheetah_patch_copy_page +cheetah_patch_copy_page: + sethi %hi(0x01000000), %o1 ! NOP + sethi %hi(cheetah_copy_page_insn), %o0 + or %o0, %lo(cheetah_copy_page_insn), %o0 + stw %o1, [%o0] + membar #StoreStore + flush %o0 + retl + nop diff --git a/arch/sparc/lib/csum_copy.S b/arch/sparc/lib/csum_copy.S new file mode 100644 index 00000000000..e566c770a0f --- /dev/null +++ b/arch/sparc/lib/csum_copy.S @@ -0,0 +1,309 @@ +/* csum_copy.S: Checksum+copy code for sparc64 + * + * Copyright (C) 2005 David S. Miller <davem@davemloft.net> + */ + +#ifdef __KERNEL__ +#define GLOBAL_SPARE %g7 +#else +#define GLOBAL_SPARE %g5 +#endif + +#ifndef EX_LD +#define EX_LD(x) x +#endif + +#ifndef EX_ST +#define EX_ST(x) x +#endif + +#ifndef EX_RETVAL +#define EX_RETVAL(x) x +#endif + +#ifndef LOAD +#define LOAD(type,addr,dest) type [addr], dest +#endif + +#ifndef STORE +#define STORE(type,src,addr) type src, [addr] +#endif + +#ifndef FUNC_NAME +#define FUNC_NAME csum_partial_copy_nocheck +#endif + + .register %g2, #scratch + .register %g3, #scratch + + .text + +90: + /* We checked for zero length already, so there must be + * at least one byte. + */ + be,pt %icc, 1f + nop + EX_LD(LOAD(ldub, %o0 + 0x00, %o4)) + add %o0, 1, %o0 + sub %o2, 1, %o2 + EX_ST(STORE(stb, %o4, %o1 + 0x00)) + add %o1, 1, %o1 +1: andcc %o0, 0x2, %g0 + be,pn %icc, 80f + cmp %o2, 2 + blu,pn %icc, 60f + nop + EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) + add %o0, 2, %o0 + sub %o2, 2, %o2 + EX_ST(STORE(sth, %o5, %o1 + 0x00)) + add %o1, 2, %o1 + ba,pt %xcc, 80f + add %o5, %o4, %o4 + + .globl FUNC_NAME +FUNC_NAME: /* %o0=src, %o1=dst, %o2=len, %o3=sum */ + LOAD(prefetch, %o0 + 0x000, #n_reads) + xor %o0, %o1, %g1 + clr %o4 + andcc %g1, 0x3, %g0 + bne,pn %icc, 95f + LOAD(prefetch, %o0 + 0x040, #n_reads) + + brz,pn %o2, 70f + andcc %o0, 0x3, %g0 + + /* We "remember" whether the lowest bit in the address + * was set in GLOBAL_SPARE. Because if it is, we have to swap + * upper and lower 8 bit fields of the sum we calculate. + */ + bne,pn %icc, 90b + andcc %o0, 0x1, GLOBAL_SPARE + +80: + LOAD(prefetch, %o0 + 0x080, #n_reads) + andncc %o2, 0x3f, %g3 + + LOAD(prefetch, %o0 + 0x0c0, #n_reads) + sub %o2, %g3, %o2 + brz,pn %g3, 2f + LOAD(prefetch, %o0 + 0x100, #n_reads) + + /* So that we don't need to use the non-pairing + * add-with-carry instructions we accumulate 32-bit + * values into a 64-bit register. At the end of the + * loop we fold it down to 32-bits and so on. 
+ */ + ba,pt %xcc, 1f + LOAD(prefetch, %o0 + 0x140, #n_reads) + + .align 32 +1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) + EX_LD(LOAD(lduw, %o0 + 0x04, %g1)) + EX_LD(LOAD(lduw, %o0 + 0x08, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x00)) + EX_LD(LOAD(lduw, %o0 + 0x0c, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x04)) + EX_LD(LOAD(lduw, %o0 + 0x10, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x08)) + EX_LD(LOAD(lduw, %o0 + 0x14, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x0c)) + EX_LD(LOAD(lduw, %o0 + 0x18, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x10)) + EX_LD(LOAD(lduw, %o0 + 0x1c, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x14)) + EX_LD(LOAD(lduw, %o0 + 0x20, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x18)) + EX_LD(LOAD(lduw, %o0 + 0x24, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x1c)) + EX_LD(LOAD(lduw, %o0 + 0x28, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x20)) + EX_LD(LOAD(lduw, %o0 + 0x2c, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x24)) + EX_LD(LOAD(lduw, %o0 + 0x30, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x28)) + EX_LD(LOAD(lduw, %o0 + 0x34, %g1)) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x2c)) + EX_LD(LOAD(lduw, %o0 + 0x38, %g2)) + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x30)) + EX_LD(LOAD(lduw, %o0 + 0x3c, %o5)) + add %o4, %g1, %o4 + EX_ST(STORE(stw, %g1, %o1 + 0x34)) + LOAD(prefetch, %o0 + 0x180, #n_reads) + add %o4, %g2, %o4 + EX_ST(STORE(stw, %g2, %o1 + 0x38)) + subcc %g3, 0x40, %g3 + add %o0, 0x40, %o0 + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x3c)) + bne,pt %icc, 1b + add %o1, 0x40, %o1 + +2: and %o2, 0x3c, %g3 + brz,pn %g3, 2f + sub %o2, %g3, %o2 +1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) + subcc %g3, 0x4, %g3 + add %o0, 0x4, %o0 + add %o4, %o5, %o4 + EX_ST(STORE(stw, %o5, %o1 + 0x00)) + bne,pt %icc, 1b + add %o1, 0x4, %o1 + +2: + /* fold 64-->32 */ + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + srlx %o4, 32, %o5 + srl %o4, 0, %o4 + add %o4, %o5, %o4 + + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +60: + /* %o4 has the 16-bit sum we have calculated so-far. */ + cmp %o2, 2 + blu,pt %icc, 1f + nop + EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) + sub %o2, 2, %o2 + add %o0, 2, %o0 + add %o4, %o5, %o4 + EX_ST(STORE(sth, %o5, %o1 + 0x00)) + add %o1, 0x2, %o1 +1: brz,pt %o2, 1f + nop + EX_LD(LOAD(ldub, %o0 + 0x00, %o5)) + sub %o2, 1, %o2 + add %o0, 1, %o0 + EX_ST(STORE(stb, %o5, %o1 + 0x00)) + sllx %o5, 8, %o5 + add %o1, 1, %o1 + add %o4, %o5, %o4 +1: + /* fold 32-->16 */ + sethi %hi(0xffff0000), %g1 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + srl %o4, 16, %o5 + andn %o4, %g1, %g2 + add %o5, %g2, %o4 + +1: brz,pt GLOBAL_SPARE, 1f + nop + + /* We started with an odd byte, byte-swap the result. 
*/ + srl %o4, 8, %o5 + and %o4, 0xff, %g1 + sll %g1, 8, %g1 + or %o5, %g1, %o4 + +1: addcc %o3, %o4, %o3 + addc %g0, %o3, %o3 + +70: + retl + srl %o3, 0, %o0 + +95: mov 0, GLOBAL_SPARE + brlez,pn %o2, 4f + andcc %o0, 1, %o5 + be,a,pt %icc, 1f + srl %o2, 1, %g1 + sub %o2, 1, %o2 + EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE)) + add %o0, 1, %o0 + EX_ST(STORE(stb, GLOBAL_SPARE, %o1)) + srl %o2, 1, %g1 + add %o1, 1, %o1 +1: brz,a,pn %g1, 3f + andcc %o2, 1, %g0 + andcc %o0, 2, %g0 + be,a,pt %icc, 1f + srl %g1, 1, %g1 + EX_LD(LOAD(lduh, %o0, %o4)) + sub %o2, 2, %o2 + srl %o4, 8, %g2 + sub %g1, 1, %g1 + EX_ST(STORE(stb, %g2, %o1)) + add %o4, GLOBAL_SPARE, GLOBAL_SPARE + EX_ST(STORE(stb, %o4, %o1 + 1)) + add %o0, 2, %o0 + srl %g1, 1, %g1 + add %o1, 2, %o1 +1: brz,a,pn %g1, 2f + andcc %o2, 2, %g0 + EX_LD(LOAD(lduw, %o0, %o4)) +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + EX_ST(STORE(stb, %g2, %o1)) + srl %o4, 8, %g2 + EX_ST(STORE(stb, %g3, %o1 + 1)) + add %o0, 4, %o0 + EX_ST(STORE(stb, %g2, %o1 + 2)) + addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE + EX_ST(STORE(stb, %o4, %o1 + 3)) + addc GLOBAL_SPARE, %g0, GLOBAL_SPARE + add %o1, 4, %o1 + subcc %g1, 1, %g1 + bne,a,pt %icc, 5b + EX_LD(LOAD(lduw, %o0, %o4)) + sll GLOBAL_SPARE, 16, %g2 + srl GLOBAL_SPARE, 16, GLOBAL_SPARE + srl %g2, 16, %g2 + andcc %o2, 2, %g0 + add %g2, GLOBAL_SPARE, GLOBAL_SPARE +2: be,a,pt %icc, 3f + andcc %o2, 1, %g0 + EX_LD(LOAD(lduh, %o0, %o4)) + andcc %o2, 1, %g0 + srl %o4, 8, %g2 + add %o0, 2, %o0 + EX_ST(STORE(stb, %g2, %o1)) + add GLOBAL_SPARE, %o4, GLOBAL_SPARE + EX_ST(STORE(stb, %o4, %o1 + 1)) + add %o1, 2, %o1 +3: be,a,pt %icc, 1f + sll GLOBAL_SPARE, 16, %o4 + EX_LD(LOAD(ldub, %o0, %g2)) + sll %g2, 8, %o4 + EX_ST(STORE(stb, %g2, %o1)) + add GLOBAL_SPARE, %o4, GLOBAL_SPARE + sll GLOBAL_SPARE, 16, %o4 +1: addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE + srl GLOBAL_SPARE, 16, %o4 + addc %g0, %o4, GLOBAL_SPARE + brz,pt %o5, 4f + srl GLOBAL_SPARE, 8, %o4 + and GLOBAL_SPARE, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, GLOBAL_SPARE +4: addcc %o3, GLOBAL_SPARE, %o3 + addc %g0, %o3, %o0 + retl + srl %o0, 0, %o0 + .size FUNC_NAME, .-FUNC_NAME diff --git a/arch/sparc/lib/csum_copy_from_user.S b/arch/sparc/lib/csum_copy_from_user.S new file mode 100644 index 00000000000..e0304e6a224 --- /dev/null +++ b/arch/sparc/lib/csum_copy_from_user.S @@ -0,0 +1,21 @@ +/* csum_copy_from_user.S: Checksum+copy from userspace. + * + * Copyright (C) 2005 David S. Miller (davem@davemloft.net) + */ + +#define EX_LD(x) \ +98: x; \ + .section .fixup, "ax"; \ + .align 4; \ +99: retl; \ + mov -1, %o0; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; + +#define FUNC_NAME __csum_partial_copy_from_user +#define LOAD(type,addr,dest) type##a [addr] %asi, dest + +#include "csum_copy.S" diff --git a/arch/sparc/lib/csum_copy_to_user.S b/arch/sparc/lib/csum_copy_to_user.S new file mode 100644 index 00000000000..afd01acc587 --- /dev/null +++ b/arch/sparc/lib/csum_copy_to_user.S @@ -0,0 +1,21 @@ +/* csum_copy_to_user.S: Checksum+copy to userspace. + * + * Copyright (C) 2005 David S. 
Miller (davem@davemloft.net) + */ + +#define EX_ST(x) \ +98: x; \ + .section .fixup,"ax"; \ + .align 4; \ +99: retl; \ + mov -1, %o0; \ + .section __ex_table,"a";\ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4; + +#define FUNC_NAME __csum_partial_copy_to_user +#define STORE(type,src,addr) type##a src, [addr] %asi + +#include "csum_copy.S" diff --git a/arch/sparc/lib/divdi3.S b/arch/sparc/lib/divdi3.S index 681b3683da9..9614b48b6ef 100644 --- a/arch/sparc/lib/divdi3.S +++ b/arch/sparc/lib/divdi3.S @@ -17,21 +17,6 @@ along with GNU CC; see the file COPYING. If not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ - .data - .align 8 - .globl __clz_tab -__clz_tab: - .byte 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 - .byte 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 - .byte 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 - .byte 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 - .byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - .byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - .byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - .byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - .size __clz_tab,256 - .global .udiv - .text .align 4 .globl __divdi3 @@ -97,8 +82,9 @@ __divdi3: bne .LL85 mov %i0,%o2 mov 1,%o0 - call .udiv,0 mov 0,%o1 + wr %g0, 0, %y + udiv %o0, %o1, %o0 mov %o0,%o4 mov %i0,%o2 .LL85: diff --git a/arch/sparc/lib/ffs.S b/arch/sparc/lib/ffs.S new file mode 100644 index 00000000000..b39389f6989 --- /dev/null +++ b/arch/sparc/lib/ffs.S @@ -0,0 +1,84 @@ +#include <linux/linkage.h> + + .register %g2,#scratch + + .text + .align 32 + +ENTRY(ffs) + brnz,pt %o0, 1f + mov 1, %o1 + retl + clr %o0 + nop + nop +ENTRY(__ffs) + sllx %o0, 32, %g1 /* 1 */ + srlx %o0, 32, %g2 + + clr %o1 /* 2 */ + movrz %g1, %g2, %o0 + + movrz %g1, 32, %o1 /* 3 */ +1: clr %o2 + + sllx %o0, (64 - 16), %g1 /* 4 */ + srlx %o0, 16, %g2 + + movrz %g1, %g2, %o0 /* 5 */ + clr %o3 + + movrz %g1, 16, %o2 /* 6 */ + clr %o4 + + and %o0, 0xff, %g1 /* 7 */ + srlx %o0, 8, %g2 + + movrz %g1, %g2, %o0 /* 8 */ + clr %o5 + + movrz %g1, 8, %o3 /* 9 */ + add %o2, %o1, %o2 + + and %o0, 0xf, %g1 /* 10 */ + srlx %o0, 4, %g2 + + movrz %g1, %g2, %o0 /* 11 */ + add %o2, %o3, %o2 + + movrz %g1, 4, %o4 /* 12 */ + + and %o0, 0x3, %g1 /* 13 */ + srlx %o0, 2, %g2 + + movrz %g1, %g2, %o0 /* 14 */ + add %o2, %o4, %o2 + + movrz %g1, 2, %o5 /* 15 */ + + and %o0, 0x1, %g1 /* 16 */ + + add %o2, %o5, %o2 /* 17 */ + xor %g1, 0x1, %g1 + + retl /* 18 */ + add %o2, %g1, %o0 +ENDPROC(ffs) +ENDPROC(__ffs) + + .section .popc_6insn_patch, "ax" + .word ffs + brz,pn %o0, 98f + neg %o0, %g1 + xnor %o0, %g1, %o1 + popc %o1, %o0 +98: retl + nop + .word __ffs + neg %o0, %g1 + xnor %o0, %g1, %o1 + popc %o1, %o0 + retl + sub %o0, 1, %o0 + nop + .previous diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S new file mode 100644 index 00000000000..95414e0a680 --- /dev/null +++ b/arch/sparc/lib/hweight.S @@ -0,0 +1,51 @@ +#include <linux/linkage.h> + + .text + .align 32 +ENTRY(__arch_hweight8) + ba,pt %xcc, __sw_hweight8 + nop + nop +ENDPROC(__arch_hweight8) + .section .popc_3insn_patch, "ax" + .word __arch_hweight8 + sllx %o0, 64-8, %g1 + retl + popc %g1, %o0 + .previous + +ENTRY(__arch_hweight16) + ba,pt %xcc, __sw_hweight16 + nop + nop +ENDPROC(__arch_hweight16) + .section .popc_3insn_patch, "ax" + .word __arch_hweight16 + sllx %o0, 64-16, %g1 + 
retl + popc %g1, %o0 + .previous + +ENTRY(__arch_hweight32) + ba,pt %xcc, __sw_hweight32 + nop + nop +ENDPROC(__arch_hweight32) + .section .popc_3insn_patch, "ax" + .word __arch_hweight32 + sllx %o0, 64-32, %g1 + retl + popc %g1, %o0 + .previous + +ENTRY(__arch_hweight64) + ba,pt %xcc, __sw_hweight64 + nop + nop +ENDPROC(__arch_hweight64) + .section .popc_3insn_patch, "ax" + .word __arch_hweight64 + retl + popc %o0, %o0 + nop + .previous diff --git a/arch/sparc/lib/iomap.c b/arch/sparc/lib/iomap.c index 54501c1ca78..c4d42a50ebc 100644 --- a/arch/sparc/lib/iomap.c +++ b/arch/sparc/lib/iomap.c @@ -18,31 +18,8 @@ void ioport_unmap(void __iomem *addr) EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); -/* Create a virtual mapping cookie for a PCI BAR (memory or IO) */ -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) -{ - unsigned long start = pci_resource_start(dev, bar); - unsigned long len = pci_resource_len(dev, bar); - unsigned long flags = pci_resource_flags(dev, bar); - - if (!len || !start) - return NULL; - if (maxlen && len > maxlen) - len = maxlen; - if (flags & IORESOURCE_IO) - return ioport_map(start, len); - if (flags & IORESOURCE_MEM) { - if (flags & IORESOURCE_CACHEABLE) - return ioremap(start, len); - return ioremap_nocache(start, len); - } - /* What? */ - return NULL; -} - void pci_iounmap(struct pci_dev *dev, void __iomem * addr) { /* nothing to do */ } -EXPORT_SYMBOL(pci_iomap); EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/sparc/lib/ipcsum.S b/arch/sparc/lib/ipcsum.S new file mode 100644 index 00000000000..4742d59029e --- /dev/null +++ b/arch/sparc/lib/ipcsum.S @@ -0,0 +1,33 @@ +#include <linux/linkage.h> + + .text +ENTRY(ip_fast_csum) /* %o0 = iph, %o1 = ihl */ + sub %o1, 4, %g7 + lduw [%o0 + 0x00], %o2 + lduw [%o0 + 0x04], %g2 + lduw [%o0 + 0x08], %g3 + addcc %g2, %o2, %o2 + lduw [%o0 + 0x0c], %g2 + addccc %g3, %o2, %o2 + lduw [%o0 + 0x10], %g3 + + addccc %g2, %o2, %o2 + addc %o2, %g0, %o2 +1: addcc %g3, %o2, %o2 + add %o0, 4, %o0 + addccc %o2, %g0, %o2 + subcc %g7, 1, %g7 + be,a,pt %icc, 2f + sll %o2, 16, %g2 + + lduw [%o0 + 0x10], %g3 + ba,pt %xcc, 1b + nop +2: addcc %o2, %g2, %g2 + srl %g2, 16, %o2 + addc %o2, %g0, %o2 + xnor %g0, %o2, %o2 + set 0xffff, %o1 + retl + and %o2, %o1, %o0 +ENDPROC(ip_fast_csum) diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c new file mode 100644 index 00000000000..323335b9cd2 --- /dev/null +++ b/arch/sparc/lib/ksyms.c @@ -0,0 +1,157 @@ +/* + * Export of symbols defined in assembler + */ + +/* Tell string.h we don't want memcpy etc. as cpp defines */ +#define EXPORT_SYMTAB_STROPS + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <asm/checksum.h> +#include <asm/uaccess.h> +#include <asm/ftrace.h> + +/* string functions */ +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strncmp); + +/* mem* functions */ +extern void *__memscan_zero(void *, size_t); +extern void *__memscan_generic(void *, int, size_t); +extern void *__bzero(void *, size_t); + +EXPORT_SYMBOL(memscan); +EXPORT_SYMBOL(__memscan_zero); +EXPORT_SYMBOL(__memscan_generic); +EXPORT_SYMBOL(memcmp); +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(__bzero); + +/* Networking helper routines. 
*/ +EXPORT_SYMBOL(csum_partial); + +#ifdef CONFIG_MCOUNT +EXPORT_SYMBOL(_mcount); +#endif + +/* + * sparc + */ +#ifdef CONFIG_SPARC32 +extern int __ashrdi3(int, int); +extern int __ashldi3(int, int); +extern int __lshrdi3(int, int); +extern int __muldi3(int, int); +extern int __divdi3(int, int); + +extern void (*__copy_1page)(void *, const void *); +extern void (*bzero_1page)(void *); + +extern void ___rw_read_enter(void); +extern void ___rw_read_try(void); +extern void ___rw_read_exit(void); +extern void ___rw_write_enter(void); + +/* Networking helper routines. */ +EXPORT_SYMBOL(__csum_partial_copy_sparc_generic); + +/* Special internal versions of library functions. */ +EXPORT_SYMBOL(__copy_1page); +EXPORT_SYMBOL(__memmove); +EXPORT_SYMBOL(bzero_1page); + +/* Moving data to/from/in userspace. */ +EXPORT_SYMBOL(__copy_user); + +/* Used by asm/spinlock.h */ +#ifdef CONFIG_SMP +EXPORT_SYMBOL(___rw_read_enter); +EXPORT_SYMBOL(___rw_read_try); +EXPORT_SYMBOL(___rw_read_exit); +EXPORT_SYMBOL(___rw_write_enter); +#endif + +EXPORT_SYMBOL(__ashrdi3); +EXPORT_SYMBOL(__ashldi3); +EXPORT_SYMBOL(__lshrdi3); +EXPORT_SYMBOL(__muldi3); +EXPORT_SYMBOL(__divdi3); +#endif + +/* + * sparc64 + */ +#ifdef CONFIG_SPARC64 +/* Networking helper routines. */ +EXPORT_SYMBOL(csum_partial_copy_nocheck); +EXPORT_SYMBOL(__csum_partial_copy_from_user); +EXPORT_SYMBOL(__csum_partial_copy_to_user); +EXPORT_SYMBOL(ip_fast_csum); + +/* Moving data to/from/in userspace. */ +EXPORT_SYMBOL(___copy_to_user); +EXPORT_SYMBOL(___copy_from_user); +EXPORT_SYMBOL(___copy_in_user); +EXPORT_SYMBOL(__clear_user); + +/* Atomic counter implementation. */ +EXPORT_SYMBOL(atomic_add); +EXPORT_SYMBOL(atomic_add_ret); +EXPORT_SYMBOL(atomic_sub); +EXPORT_SYMBOL(atomic_sub_ret); +EXPORT_SYMBOL(atomic64_add); +EXPORT_SYMBOL(atomic64_add_ret); +EXPORT_SYMBOL(atomic64_sub); +EXPORT_SYMBOL(atomic64_sub_ret); +EXPORT_SYMBOL(atomic64_dec_if_positive); + +/* Atomic bit operations. */ +EXPORT_SYMBOL(test_and_set_bit); +EXPORT_SYMBOL(test_and_clear_bit); +EXPORT_SYMBOL(test_and_change_bit); +EXPORT_SYMBOL(set_bit); +EXPORT_SYMBOL(clear_bit); +EXPORT_SYMBOL(change_bit); + +/* Special internal versions of library functions. 
*/ +EXPORT_SYMBOL(_clear_page); +EXPORT_SYMBOL(clear_user_page); +EXPORT_SYMBOL(copy_user_page); + +/* RAID code needs this */ +void VISenter(void); +EXPORT_SYMBOL(VISenter); + +/* CRYPTO code needs this */ +void VISenterhalf(void); +EXPORT_SYMBOL(VISenterhalf); + +extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *); +extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *, + unsigned long *); +extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *, + unsigned long *, unsigned long *, unsigned long *); +EXPORT_SYMBOL(xor_vis_2); +EXPORT_SYMBOL(xor_vis_3); +EXPORT_SYMBOL(xor_vis_4); +EXPORT_SYMBOL(xor_vis_5); + +extern void xor_niagara_2(unsigned long, unsigned long *, unsigned long *); +extern void xor_niagara_3(unsigned long, unsigned long *, unsigned long *, + unsigned long *); +extern void xor_niagara_4(unsigned long, unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void xor_niagara_5(unsigned long, unsigned long *, unsigned long *, + unsigned long *, unsigned long *, unsigned long *); + +EXPORT_SYMBOL(xor_niagara_2); +EXPORT_SYMBOL(xor_niagara_3); +EXPORT_SYMBOL(xor_niagara_4); +EXPORT_SYMBOL(xor_niagara_5); +#endif diff --git a/arch/sparc/lib/locks.S b/arch/sparc/lib/locks.S index b1df55cb221..64f53f2b673 100644 --- a/arch/sparc/lib/locks.S +++ b/arch/sparc/lib/locks.S @@ -1,4 +1,4 @@ -/* $Id: locks.S,v 1.16 2000/02/26 11:02:47 anton Exp $ +/* * locks.S: SMP low-level lock primitives on Sparc. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/lib/lshrdi3.S b/arch/sparc/lib/lshrdi3.S index 35abf5b2bd1..60ebc7cdbee 100644 --- a/arch/sparc/lib/lshrdi3.S +++ b/arch/sparc/lib/lshrdi3.S @@ -1,7 +1,6 @@ -/* $Id: lshrdi3.S,v 1.1 1999/03/21 06:37:45 davem Exp $ */ +#include <linux/linkage.h> - .globl __lshrdi3 -__lshrdi3: +ENTRY(__lshrdi3) cmp %o2, 0 be 3f mov 0x20, %g2 @@ -25,3 +24,4 @@ __lshrdi3: 3: retl nop +ENDPROC(__lshrdi3) diff --git a/arch/sparc/lib/mcount.S b/arch/sparc/lib/mcount.S new file mode 100644 index 00000000000..3ad6cbdc216 --- /dev/null +++ b/arch/sparc/lib/mcount.S @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) + * + * This file implements mcount(), which is used to collect profiling data. + * This can also be tweaked for kernel stack overflow detection. + */ + +#include <linux/linkage.h> + +/* + * This is the main variant and is called by C code. GCC's -pg option + * automatically instruments every C function with a call to this. + */ + + .text + .align 32 + .globl _mcount + .type _mcount,#function + .globl mcount + .type mcount,#function +_mcount: +mcount: +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + /* Do nothing, the retl/nop below is all we need. 
*/ +#else + sethi %hi(function_trace_stop), %g1 + lduw [%g1 + %lo(function_trace_stop)], %g2 + brnz,pn %g2, 2f + sethi %hi(ftrace_trace_function), %g1 + sethi %hi(ftrace_stub), %g2 + ldx [%g1 + %lo(ftrace_trace_function)], %g1 + or %g2, %lo(ftrace_stub), %g2 + cmp %g1, %g2 + be,pn %icc, 1f + mov %i7, %g3 + save %sp, -176, %sp + mov %g3, %o1 + jmpl %g1, %o7 + mov %i7, %o0 + ret + restore + /* not reached */ +1: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + sethi %hi(ftrace_graph_return), %g1 + ldx [%g1 + %lo(ftrace_graph_return)], %g3 + cmp %g2, %g3 + bne,pn %xcc, 5f + sethi %hi(ftrace_graph_entry_stub), %g2 + sethi %hi(ftrace_graph_entry), %g1 + or %g2, %lo(ftrace_graph_entry_stub), %g2 + ldx [%g1 + %lo(ftrace_graph_entry)], %g1 + cmp %g1, %g2 + be,pt %xcc, 2f + nop +5: mov %i7, %g2 + mov %fp, %g3 + save %sp, -176, %sp + mov %g2, %l0 + ba,pt %xcc, ftrace_graph_caller + mov %g3, %l1 +#endif +2: +#endif +#endif + retl + nop + .size _mcount,.-_mcount + .size mcount,.-mcount + +#ifdef CONFIG_FUNCTION_TRACER + .globl ftrace_stub + .type ftrace_stub,#function +ftrace_stub: + retl + nop + .size ftrace_stub,.-ftrace_stub +#ifdef CONFIG_DYNAMIC_FTRACE + .globl ftrace_caller + .type ftrace_caller,#function +ftrace_caller: + sethi %hi(function_trace_stop), %g1 + mov %i7, %g2 + lduw [%g1 + %lo(function_trace_stop)], %g1 + brnz,pn %g1, ftrace_stub + mov %fp, %g3 + save %sp, -176, %sp + mov %g2, %o1 + mov %g2, %l0 + mov %g3, %l1 + .globl ftrace_call +ftrace_call: + call ftrace_stub + mov %i7, %o0 +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .globl ftrace_graph_call +ftrace_graph_call: + call ftrace_stub + nop +#endif + ret + restore +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .size ftrace_graph_call,.-ftrace_graph_call +#endif + .size ftrace_call,.-ftrace_call + .size ftrace_caller,.-ftrace_caller +#endif +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + mov %l0, %o0 + mov %i7, %o1 + call prepare_ftrace_return + mov %l1, %o2 + ret + restore %o0, -8, %i7 +END(ftrace_graph_caller) + +ENTRY(return_to_handler) + save %sp, -176, %sp + call ftrace_return_to_handler + mov %fp, %o0 + jmpl %o0 + 8, %g0 + restore +END(return_to_handler) +#endif diff --git a/arch/sparc/lib/memcmp.S b/arch/sparc/lib/memcmp.S index cb4bdb0cc2a..efa106c41ed 100644 --- a/arch/sparc/lib/memcmp.S +++ b/arch/sparc/lib/memcmp.S @@ -1,312 +1,27 @@ - .text - .align 4 - .global __memcmp, memcmp -__memcmp: -memcmp: -#if 1 - cmp %o2, 0 - ble L3 - mov 0, %g3 -L5: - ldub [%o0], %g2 - ldub [%o1], %g3 - sub %g2, %g3, %g2 - mov %g2, %g3 - sll %g2, 24, %g2 - - cmp %g2, 0 - bne L3 - add %o0, 1, %o0 +/* Sparc optimized memcmp code. + * + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 2000, 2008 David S. 
Miller (davem@davemloft.net) + */ - add %o2, -1, %o2 +#include <linux/linkage.h> +#include <asm/asm.h> + .text +ENTRY(memcmp) cmp %o2, 0 - bg L5 - add %o1, 1, %o1 -L3: - sll %g3, 24, %o0 - sra %o0, 24, %o0 - - retl +1: BRANCH32(be, pn, 2f) nop -#else - save %sp, -104, %sp - mov %i2, %o4 - mov %i0, %o0 - - cmp %o4, 15 - ble L72 - mov %i1, %i2 - - andcc %i2, 3, %g0 - be L161 - andcc %o0, 3, %g2 -L75: - ldub [%o0], %g3 - ldub [%i2], %g2 - add %o0,1, %o0 - - subcc %g3, %g2, %i0 - bne L156 - add %i2, 1, %i2 - - andcc %i2, 3, %g0 - bne L75 - add %o4, -1, %o4 - - andcc %o0, 3, %g2 -L161: - bne,a L78 - mov %i2, %i1 - - mov %o0, %i5 - mov %i2, %i3 - srl %o4, 2, %i4 - - cmp %i4, 0 - bge L93 - mov %i4, %g2 - - add %i4, 3, %g2 -L93: - sra %g2, 2, %g2 - sll %g2, 2, %g2 - sub %i4, %g2, %g2 - - cmp %g2, 1 - be,a L88 - add %o0, 4, %i5 - - bg L94 - cmp %g2, 2 - - cmp %g2, 0 - be,a L86 - ld [%o0], %g3 - - b L162 - ld [%i5], %g3 -L94: - be L81 - cmp %g2, 3 - - be,a L83 - add %o0, -4, %i5 - - b L162 - ld [%i5], %g3 -L81: - add %o0, -8, %i5 - ld [%o0], %g3 - add %i2, -8, %i3 - ld [%i2], %g2 - - b L82 - add %i4, 2, %i4 -L83: - ld [%o0], %g4 - add %i2, -4, %i3 - ld [%i2], %g1 - - b L84 - add %i4, 1, %i4 -L86: - b L87 - ld [%i2], %g2 -L88: - add %i2, 4, %i3 - ld [%o0], %g4 - add %i4, -1, %i4 - ld [%i2], %g1 -L95: - ld [%i5], %g3 -L162: - cmp %g4, %g1 - be L87 - ld [%i3], %g2 - - cmp %g4, %g1 -L163: - bleu L114 - mov -1, %i0 - - b L114 - mov 1, %i0 -L87: - ld [%i5 + 4], %g4 - cmp %g3, %g2 - bne L163 - ld [%i3 + 4], %g1 -L84: - ld [%i5 + 8], %g3 - - cmp %g4, %g1 - bne L163 - ld [%i3 + 8], %g2 -L82: - ld [%i5 + 12], %g4 - cmp %g3, %g2 - bne L163 - ld [%i3 + 12], %g1 - - add %i5, 16, %i5 - - addcc %i4, -4, %i4 - bne L95 - add %i3, 16, %i3 - - cmp %g4, %g1 - bne L163 - nop - - b L114 - mov 0, %i0 -L78: - srl %o4, 2, %i0 - and %o0, -4, %i3 - orcc %i0, %g0, %g3 - sll %g2, 3, %o7 - mov 32, %g2 - - bge L129 - sub %g2, %o7, %o1 - - add %i0, 3, %g3 -L129: - sra %g3, 2, %g2 - sll %g2, 2, %g2 - sub %i0, %g2, %g2 - - cmp %g2, 1 - be,a L124 - ld [%i3], %o3 - - bg L130 - cmp %g2, 2 - - cmp %g2, 0 - be,a L122 - ld [%i3], %o2 - - b L164 - sll %o3, %o7, %g3 -L130: - be L117 - cmp %g2, 3 - - be,a L119 - ld [%i3], %g1 - - b L164 - sll %o3, %o7, %g3 -L117: - ld [%i3], %g4 - add %i2, -8, %i1 - ld [%i3 + 4], %o3 - add %i0, 2, %i0 - ld [%i2], %i4 - - b L118 - add %i3, -4, %i3 -L119: - ld [%i3 + 4], %g4 - add %i2, -4, %i1 - ld [%i2], %i5 - - b L120 - add %i0, 1, %i0 -L122: - ld [%i3 + 4], %g1 - ld [%i2], %i4 - - b L123 - add %i3, 4, %i3 -L124: - add %i2, 4, %i1 - ld [%i3 + 4], %o2 - add %i0, -1, %i0 - ld [%i2], %i5 - add %i3, 8, %i3 -L131: - sll %o3, %o7, %g3 -L164: - srl %o2, %o1, %g2 - ld [%i3], %g1 - or %g3, %g2, %g3 - - cmp %g3, %i5 - bne L163 - ld [%i1], %i4 -L123: - sll %o2, %o7, %g3 - srl %g1, %o1, %g2 - ld [%i3 + 4], %g4 - or %g3, %g2, %g3 - - cmp %g3, %i4 - bne L163 - ld [%i1 + 4], %i5 -L120: - sll %g1, %o7, %g3 - srl %g4, %o1, %g2 - ld [%i3 + 8], %o3 - or %g3, %g2, %g3 - - cmp %g3, %i5 - bne L163 - ld [%i1 + 8], %i4 -L118: - sll %g4, %o7, %g3 - srl %o3, %o1, %g2 - ld [%i3 + 12], %o2 - or %g3, %g2, %g3 - - cmp %g3, %i4 - bne L163 - ld [%i1 + 12], %i5 - - add %i3, 16, %i3 - addcc %i0, -4, %i0 - bne L131 - add %i1, 16, %i1 - - sll %o3, %o7, %g3 - srl %o2, %o1, %g2 - or %g3, %g2, %g3 - - cmp %g3, %i5 - be,a L114 - mov 0, %i0 - - b,a L163 -L114: - cmp %i0, 0 - bne L156 - and %o4, -4, %g2 - - add %o0, %g2, %o0 - add %i2, %g2, %i2 - and %o4, 3, %o4 -L72: - cmp %o4, 0 - be L156 - mov 0, %i0 - - ldub [%o0], %g3 -L165: - ldub [%i2], %g2 + 
ldub [%o0], %g7 + ldub [%o1], %g3 + sub %o2, 1, %o2 add %o0, 1, %o0 - - subcc %g3, %g2, %i0 - bne L156 - add %i2, 1, %i2 - - addcc %o4, -1, %o4 - bne,a L165 - ldub [%o0], %g3 - - mov 0, %i0 -L156: - ret - restore -#endif + add %o1, 1, %o1 + subcc %g7, %g3, %g3 + BRANCH32(be, pt, 1b) + cmp %o2, 0 + retl + mov %g3, %o0 +2: retl + mov 0, %o0 +ENDPROC(memcmp) diff --git a/arch/sparc/lib/memcpy.S b/arch/sparc/lib/memcpy.S index ce10bc869af..4d8c497517b 100644 --- a/arch/sparc/lib/memcpy.S +++ b/arch/sparc/lib/memcpy.S @@ -7,40 +7,12 @@ * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) */ -#ifdef __KERNEL__ - -#define FUNC(x) \ +#define FUNC(x) \ .globl x; \ .type x,@function; \ - .align 4; \ + .align 4; \ x: -#undef FASTER_REVERSE -#undef FASTER_NONALIGNED -#define FASTER_ALIGNED - -/* In kernel these functions don't return a value. - * One should use macros in asm/string.h for that purpose. - * We return 0, so that bugs are more apparent. - */ -#define SETUP_RETL -#define RETL_INSN clr %o0 - -#else - -/* libc */ - -#include "DEFS.h" - -#define FASTER_REVERSE -#define FASTER_NONALIGNED -#define FASTER_ALIGNED - -#define SETUP_RETL mov %o0, %g6 -#define RETL_INSN mov %g6, %o0 - -#endif - /* Both these macros have to start with exactly the same insn */ #define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \ ldd [%src + (offset) + 0x00], %t0; \ @@ -164,30 +136,6 @@ x: .text .align 4 -#ifdef FASTER_REVERSE - -70: /* rdword_align */ - - andcc %o1, 1, %g0 - be 4f - andcc %o1, 2, %g0 - - ldub [%o1 - 1], %g2 - sub %o1, 1, %o1 - stb %g2, [%o0 - 1] - sub %o2, 1, %o2 - be 3f - sub %o0, 1, %o0 -4: - lduh [%o1 - 2], %g2 - sub %o1, 2, %o1 - sth %g2, [%o0 - 2] - sub %o2, 2, %o2 - b 3f - sub %o0, 2, %o0 - -#endif /* FASTER_REVERSE */ - 0: retl nop ! Only bcopy returns here and it retuns void... 
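The rewritten memcmp() above replaces the old unrolled word-at-a-time code with a plain byte loop: load one byte from each buffer, stop at the first difference and return that signed difference, or return zero once the whole length has matched. A minimal C sketch of that behaviour (sketch_memcmp() is a hypothetical name, offered only as a reading aid) would be:

    #include <stddef.h>

    static int sketch_memcmp(const void *cs, const void *ct, size_t count)
    {
            const unsigned char *s1 = cs, *s2 = ct;

            while (count--) {
                    int diff = *s1++ - *s2++;  /* ldub/ldub + subcc in the assembly */
                    if (diff)
                            return diff;       /* first differing byte decides the result */
            }
            return 0;                          /* whole length matched */
    }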
@@ -198,7 +146,7 @@ FUNC(__memmove) #endif FUNC(memmove) cmp %o0, %o1 - SETUP_RETL + mov %o0, %g7 bleu 9f sub %o0, %o1, %o4 @@ -207,8 +155,6 @@ FUNC(memmove) bleu 0f andcc %o4, 3, %o5 -#ifndef FASTER_REVERSE - add %o1, %o2, %o1 add %o0, %o2, %o0 sub %o1, 1, %o1 @@ -224,295 +170,7 @@ FUNC(memmove) sub %o0, 1, %o0 retl - RETL_INSN - -#else /* FASTER_REVERSE */ - - add %o1, %o2, %o1 - add %o0, %o2, %o0 - bne 77f - cmp %o2, 15 - bleu 91f - andcc %o1, 3, %g0 - bne 70b -3: - andcc %o1, 4, %g0 - - be 2f - mov %o2, %g1 - - ld [%o1 - 4], %o4 - sub %g1, 4, %g1 - st %o4, [%o0 - 4] - sub %o1, 4, %o1 - sub %o0, 4, %o0 -2: - andcc %g1, 0xffffff80, %g7 - be 3f - andcc %o0, 4, %g0 - - be 74f + 4 -5: - RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) - RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) - RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) - RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) - subcc %g7, 128, %g7 - sub %o1, 128, %o1 - bne 5b - sub %o0, 128, %o0 -3: - andcc %g1, 0x70, %g7 - be 72f - andcc %g1, 8, %g0 - - sethi %hi(72f), %o5 - srl %g7, 1, %o4 - add %g7, %o4, %o4 - sub %o1, %g7, %o1 - sub %o5, %o4, %o5 - jmpl %o5 + %lo(72f), %g0 - sub %o0, %g7, %o0 - -71: /* rmemcpy_table */ - RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5) - RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5) - RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5) - RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5) - RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5) - RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5) - RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5) - -72: /* rmemcpy_table_end */ - - be 73f - andcc %g1, 4, %g0 - - ldd [%o1 - 0x08], %g2 - sub %o0, 8, %o0 - sub %o1, 8, %o1 - st %g2, [%o0] - st %g3, [%o0 + 0x04] - -73: /* rmemcpy_last7 */ - - be 1f - andcc %g1, 2, %g0 - - ld [%o1 - 4], %g2 - sub %o1, 4, %o1 - st %g2, [%o0 - 4] - sub %o0, 4, %o0 -1: - be 1f - andcc %g1, 1, %g0 - - lduh [%o1 - 2], %g2 - sub %o1, 2, %o1 - sth %g2, [%o0 - 2] - sub %o0, 2, %o0 -1: - be 1f - nop - - ldub [%o1 - 1], %g2 - stb %g2, [%o0 - 1] -1: - retl - RETL_INSN - -74: /* rldd_std */ - RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) - RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) - RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) - RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) - subcc %g7, 128, %g7 - sub %o1, 128, %o1 - bne 74b - sub %o0, 128, %o0 - - andcc %g1, 0x70, %g7 - be 72b - andcc %g1, 8, %g0 - - sethi %hi(72b), %o5 - srl %g7, 1, %o4 - add %g7, %o4, %o4 - sub %o1, %g7, %o1 - sub %o5, %o4, %o5 - jmpl %o5 + %lo(72b), %g0 - sub %o0, %g7, %o0 - -75: /* rshort_end */ - - and %o2, 0xe, %o3 -2: - sethi %hi(76f), %o5 - sll %o3, 3, %o4 - sub %o0, %o3, %o0 - sub %o5, %o4, %o5 - sub %o1, %o3, %o1 - jmpl %o5 + %lo(76f), %g0 - andcc %o2, 1, %g0 - - RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3) - RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3) - RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3) - RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3) - RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3) - RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3) - RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3) - -76: /* rshort_table_end */ - - be 1f - nop - ldub [%o1 - 1], %g2 - stb %g2, [%o0 - 1] -1: - retl - RETL_INSN - -91: /* rshort_aligned_end */ - - bne 75b - andcc %o2, 8, %g0 - - be 1f - andcc %o2, 4, %g0 - - ld [%o1 - 0x08], %g2 - ld [%o1 - 0x04], %g3 - sub %o1, 8, %o1 - st %g2, [%o0 - 0x08] - st %g3, [%o0 - 0x04] - sub %o0, 8, %o0 -1: - b 73b - mov %o2, %g1 - -77: /* rnon_aligned */ - cmp %o2, 15 - bleu 75b - 
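The memmove() hunks above keep only the simple policy: take the memcpy path when a forward copy is safe, otherwise fall back to a descending byte-by-byte copy now that the FASTER_REVERSE block is gone. Assuming that reading of the assembly, a hypothetical C equivalent (sketch_memmove() is not real kernel code) would be:

    #include <stddef.h>
    #include <string.h>

    static void *sketch_memmove(void *dst, const void *src, size_t n)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;

            if (d <= s || s + n <= d)          /* no harmful overlap: forward copy is safe */
                    return memcpy(dst, src, n);

            while (n--)                        /* dst overlaps the tail of src: copy high to low */
                    d[n] = s[n];
            return dst;
    }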
andcc %o0, 3, %g0 - be 64f - andcc %o0, 1, %g0 - be 63f - andcc %o0, 2, %g0 - ldub [%o1 - 1], %g5 - sub %o1, 1, %o1 - stb %g5, [%o0 - 1] - sub %o0, 1, %o0 - be 64f - sub %o2, 1, %o2 -63: - ldub [%o1 - 1], %g5 - sub %o1, 2, %o1 - stb %g5, [%o0 - 1] - sub %o0, 2, %o0 - ldub [%o1], %g5 - sub %o2, 2, %o2 - stb %g5, [%o0] -64: - and %o1, 3, %g2 - and %o1, -4, %o1 - and %o2, 0xc, %g3 - add %o1, 4, %o1 - cmp %g3, 4 - sll %g2, 3, %g4 - mov 32, %g2 - be 4f - sub %g2, %g4, %g7 - - blu 3f - cmp %g3, 8 - - be 2f - srl %o2, 2, %g3 - - ld [%o1 - 4], %o3 - add %o0, -8, %o0 - ld [%o1 - 8], %o4 - add %o1, -16, %o1 - b 7f - add %g3, 1, %g3 -2: - ld [%o1 - 4], %o4 - add %o0, -4, %o0 - ld [%o1 - 8], %g1 - add %o1, -12, %o1 - b 8f - add %g3, 2, %g3 -3: - ld [%o1 - 4], %o5 - add %o0, -12, %o0 - ld [%o1 - 8], %o3 - add %o1, -20, %o1 - b 6f - srl %o2, 2, %g3 -4: - ld [%o1 - 4], %g1 - srl %o2, 2, %g3 - ld [%o1 - 8], %o5 - add %o1, -24, %o1 - add %o0, -16, %o0 - add %g3, -1, %g3 - - ld [%o1 + 12], %o3 -5: - sll %o5, %g4, %g2 - srl %g1, %g7, %g5 - or %g2, %g5, %g2 - st %g2, [%o0 + 12] -6: - ld [%o1 + 8], %o4 - sll %o3, %g4, %g2 - srl %o5, %g7, %g5 - or %g2, %g5, %g2 - st %g2, [%o0 + 8] -7: - ld [%o1 + 4], %g1 - sll %o4, %g4, %g2 - srl %o3, %g7, %g5 - or %g2, %g5, %g2 - st %g2, [%o0 + 4] -8: - ld [%o1], %o5 - sll %g1, %g4, %g2 - srl %o4, %g7, %g5 - addcc %g3, -4, %g3 - or %g2, %g5, %g2 - add %o1, -16, %o1 - st %g2, [%o0] - add %o0, -16, %o0 - bne,a 5b - ld [%o1 + 12], %o3 - sll %o5, %g4, %g2 - srl %g1, %g7, %g5 - srl %g4, 3, %g3 - or %g2, %g5, %g2 - add %o1, %g3, %o1 - andcc %o2, 2, %g0 - st %g2, [%o0 + 12] - be 1f - andcc %o2, 1, %g0 - - ldub [%o1 + 15], %g5 - add %o1, -2, %o1 - stb %g5, [%o0 + 11] - add %o0, -2, %o0 - ldub [%o1 + 16], %g5 - stb %g5, [%o0 + 12] -1: - be 1f - nop - ldub [%o1 + 15], %g5 - stb %g5, [%o0 + 11] -1: - retl - RETL_INSN - -#endif /* FASTER_REVERSE */ + mov %g7, %o0 /* NOTE: This code is executed just for the cases, where %src (=%o1) & 3 is != 0. 
@@ -543,13 +201,10 @@ FUNC(memmove) b 3f add %o0, 2, %o0 -#ifdef __KERNEL__ -FUNC(__memcpy) -#endif FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ sub %o0, %o1, %o4 - SETUP_RETL + mov %o0, %g7 9: andcc %o4, 3, %o5 0: @@ -572,7 +227,7 @@ FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ add %o1, 4, %o1 add %o0, 4, %o0 2: - andcc %g1, 0xffffff80, %g7 + andcc %g1, 0xffffff80, %g0 be 3f andcc %o0, 4, %g0 @@ -582,22 +237,23 @@ FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) - subcc %g7, 128, %g7 + sub %g1, 128, %g1 add %o1, 128, %o1 - bne 5b + cmp %g1, 128 + bge 5b add %o0, 128, %o0 3: - andcc %g1, 0x70, %g7 + andcc %g1, 0x70, %g4 be 80f andcc %g1, 8, %g0 sethi %hi(80f), %o5 - srl %g7, 1, %o4 - add %g7, %o4, %o4 - add %o1, %g7, %o1 + srl %g4, 1, %o4 + add %g4, %o4, %o4 + add %o1, %g4, %o1 sub %o5, %o4, %o5 jmpl %o5 + %lo(80f), %g0 - add %o0, %g7, %o0 + add %o0, %g4, %o0 79: /* memcpy_table */ @@ -644,43 +300,28 @@ FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ stb %g2, [%o0] 1: retl - RETL_INSN + mov %g7, %o0 82: /* ldd_std */ MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5) MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5) - subcc %g7, 128, %g7 + subcc %g1, 128, %g1 add %o1, 128, %o1 - bne 82b + cmp %g1, 128 + bge 82b add %o0, 128, %o0 -#ifndef FASTER_ALIGNED - - andcc %g1, 0x70, %g7 - be 80b - andcc %g1, 8, %g0 - - sethi %hi(80b), %o5 - srl %g7, 1, %o4 - add %g7, %o4, %o4 - add %o1, %g7, %o1 - sub %o5, %o4, %o5 - jmpl %o5 + %lo(80b), %g0 - add %o0, %g7, %o0 - -#else /* FASTER_ALIGNED */ - - andcc %g1, 0x70, %g7 + andcc %g1, 0x70, %g4 be 84f andcc %g1, 8, %g0 sethi %hi(84f), %o5 - add %o1, %g7, %o1 - sub %o5, %g7, %o5 + add %o1, %g4, %o1 + sub %o5, %g4, %o5 jmpl %o5 + %lo(84f), %g0 - add %o0, %g7, %o0 + add %o0, %g4, %o0 83: /* amemcpy_table */ @@ -724,382 +365,132 @@ FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ stb %g2, [%o0] 1: retl - RETL_INSN - -#endif /* FASTER_ALIGNED */ + mov %g7, %o0 86: /* non_aligned */ cmp %o2, 6 bleu 88f + nop -#ifdef FASTER_NONALIGNED - - cmp %o2, 256 - bcc 87f - -#endif /* FASTER_NONALIGNED */ - - andcc %o0, 3, %g0 + save %sp, -96, %sp + andcc %i0, 3, %g0 be 61f - andcc %o0, 1, %g0 + andcc %i0, 1, %g0 be 60f - andcc %o0, 2, %g0 + andcc %i0, 2, %g0 - ldub [%o1], %g5 - add %o1, 1, %o1 - stb %g5, [%o0] - sub %o2, 1, %o2 + ldub [%i1], %g5 + add %i1, 1, %i1 + stb %g5, [%i0] + sub %i2, 1, %i2 bne 61f - add %o0, 1, %o0 + add %i0, 1, %i0 60: - ldub [%o1], %g3 - add %o1, 2, %o1 - stb %g3, [%o0] - sub %o2, 2, %o2 - ldub [%o1 - 1], %g3 - add %o0, 2, %o0 - stb %g3, [%o0 - 1] + ldub [%i1], %g3 + add %i1, 2, %i1 + stb %g3, [%i0] + sub %i2, 2, %i2 + ldub [%i1 - 1], %g3 + add %i0, 2, %i0 + stb %g3, [%i0 - 1] 61: - and %o1, 3, %g2 - and %o2, 0xc, %g3 - and %o1, -4, %o1 + and %i1, 3, %g2 + and %i2, 0xc, %g3 + and %i1, -4, %i1 cmp %g3, 4 sll %g2, 3, %g4 mov 32, %g2 be 4f - sub %g2, %g4, %g7 + sub %g2, %g4, %l0 blu 3f cmp %g3, 0x8 be 2f - srl %o2, 2, %g3 + srl %i2, 2, %g3 - ld [%o1], %o3 - add %o0, -8, %o0 - ld [%o1 + 4], %o4 + ld [%i1], %i3 + add %i0, -8, %i0 + ld [%i1 + 4], %i4 b 8f add %g3, 1, %g3 2: - ld [%o1], %o4 - add %o0, -12, %o0 - ld [%o1 + 4], %o5 + ld [%i1], %i4 + add %i0, -12, %i0 + ld [%i1 + 4], %i5 add %g3, 2, %g3 b 9f - add %o1, -4, %o1 + add %i1, -4, 
%i1 3: - ld [%o1], %g1 - add %o0, -4, %o0 - ld [%o1 + 4], %o3 - srl %o2, 2, %g3 + ld [%i1], %g1 + add %i0, -4, %i0 + ld [%i1 + 4], %i3 + srl %i2, 2, %g3 b 7f - add %o1, 4, %o1 + add %i1, 4, %i1 4: - ld [%o1], %o5 - cmp %o2, 7 - ld [%o1 + 4], %g1 - srl %o2, 2, %g3 + ld [%i1], %i5 + cmp %i2, 7 + ld [%i1 + 4], %g1 + srl %i2, 2, %g3 bleu 10f - add %o1, 8, %o1 + add %i1, 8, %i1 - ld [%o1], %o3 + ld [%i1], %i3 add %g3, -1, %g3 5: - sll %o5, %g4, %g2 - srl %g1, %g7, %g5 + sll %i5, %g4, %g2 + srl %g1, %l0, %g5 or %g2, %g5, %g2 - st %g2, [%o0] + st %g2, [%i0] 7: - ld [%o1 + 4], %o4 + ld [%i1 + 4], %i4 sll %g1, %g4, %g2 - srl %o3, %g7, %g5 + srl %i3, %l0, %g5 or %g2, %g5, %g2 - st %g2, [%o0 + 4] + st %g2, [%i0 + 4] 8: - ld [%o1 + 8], %o5 - sll %o3, %g4, %g2 - srl %o4, %g7, %g5 + ld [%i1 + 8], %i5 + sll %i3, %g4, %g2 + srl %i4, %l0, %g5 or %g2, %g5, %g2 - st %g2, [%o0 + 8] + st %g2, [%i0 + 8] 9: - ld [%o1 + 12], %g1 - sll %o4, %g4, %g2 - srl %o5, %g7, %g5 + ld [%i1 + 12], %g1 + sll %i4, %g4, %g2 + srl %i5, %l0, %g5 addcc %g3, -4, %g3 or %g2, %g5, %g2 - add %o1, 16, %o1 - st %g2, [%o0 + 12] - add %o0, 16, %o0 + add %i1, 16, %i1 + st %g2, [%i0 + 12] + add %i0, 16, %i0 bne,a 5b - ld [%o1], %o3 + ld [%i1], %i3 10: - sll %o5, %g4, %g2 - srl %g1, %g7, %g5 - srl %g7, 3, %g3 + sll %i5, %g4, %g2 + srl %g1, %l0, %g5 + srl %l0, 3, %g3 or %g2, %g5, %g2 - sub %o1, %g3, %o1 - andcc %o2, 2, %g0 - st %g2, [%o0] + sub %i1, %g3, %i1 + andcc %i2, 2, %g0 + st %g2, [%i0] be 1f - andcc %o2, 1, %g0 - - ldub [%o1], %g2 - add %o1, 2, %o1 - stb %g2, [%o0 + 4] - add %o0, 2, %o0 - ldub [%o1 - 1], %g2 - stb %g2, [%o0 + 3] + andcc %i2, 1, %g0 + + ldub [%i1], %g2 + add %i1, 2, %i1 + stb %g2, [%i0 + 4] + add %i0, 2, %i0 + ldub [%i1 - 1], %g2 + stb %g2, [%i0 + 3] 1: be 1f nop - ldub [%o1], %g2 - stb %g2, [%o0 + 4] + ldub [%i1], %g2 + stb %g2, [%i0 + 4] 1: - retl - RETL_INSN - -#ifdef FASTER_NONALIGNED - -87: /* faster_nonaligned */ - - andcc %o1, 3, %g0 - be 3f - andcc %o1, 1, %g0 - - be 4f - andcc %o1, 2, %g0 - - ldub [%o1], %g2 - add %o1, 1, %o1 - stb %g2, [%o0] - sub %o2, 1, %o2 - bne 3f - add %o0, 1, %o0 -4: - lduh [%o1], %g2 - add %o1, 2, %o1 - srl %g2, 8, %g3 - sub %o2, 2, %o2 - stb %g3, [%o0] - add %o0, 2, %o0 - stb %g2, [%o0 - 1] -3: - andcc %o1, 4, %g0 - - bne 2f - cmp %o5, 1 - - ld [%o1], %o4 - srl %o4, 24, %g2 - stb %g2, [%o0] - srl %o4, 16, %g3 - stb %g3, [%o0 + 1] - srl %o4, 8, %g2 - stb %g2, [%o0 + 2] - sub %o2, 4, %o2 - stb %o4, [%o0 + 3] - add %o1, 4, %o1 - add %o0, 4, %o0 -2: - be 33f - cmp %o5, 2 - be 32f - sub %o2, 4, %o2 -31: - ld [%o1], %g2 - add %o1, 4, %o1 - srl %g2, 24, %g3 - and %o0, 7, %g5 - stb %g3, [%o0] - cmp %g5, 7 - sll %g2, 8, %g1 - add %o0, 4, %o0 - be 41f - and %o2, 0xffffffc0, %o3 - ld [%o0 - 7], %o4 -4: - SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - subcc %o3, 64, %o3 - add %o1, 64, %o1 - bne 4b - add %o0, 64, %o0 - - andcc %o2, 0x30, %o3 - be,a 1f - srl %g1, 16, %g2 -4: - SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - subcc %o3, 16, %o3 - add %o1, 16, %o1 - bne 4b - add %o0, 16, %o0 - - srl %g1, 16, %g2 -1: - st %o4, [%o0 - 7] - sth %g2, [%o0 - 3] - srl %g1, 8, %g4 - b 88f - stb %g4, [%o0 - 1] -32: - ld [%o1], %g2 - add %o1, 4, %o1 - srl %g2, 16, %g3 - and %o0, 7, %g5 - sth %g3, [%o0] - cmp %g5, 6 - sll %g2, 16, %g1 - add %o0, 4, %o0 - be 
42f - and %o2, 0xffffffc0, %o3 - ld [%o0 - 6], %o4 -4: - SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - subcc %o3, 64, %o3 - add %o1, 64, %o1 - bne 4b - add %o0, 64, %o0 - - andcc %o2, 0x30, %o3 - be,a 1f - srl %g1, 16, %g2 -4: - SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - subcc %o3, 16, %o3 - add %o1, 16, %o1 - bne 4b - add %o0, 16, %o0 - - srl %g1, 16, %g2 -1: - st %o4, [%o0 - 6] - b 88f - sth %g2, [%o0 - 2] -33: - ld [%o1], %g2 - sub %o2, 4, %o2 - srl %g2, 24, %g3 - and %o0, 7, %g5 - stb %g3, [%o0] - cmp %g5, 5 - srl %g2, 8, %g4 - sll %g2, 24, %g1 - sth %g4, [%o0 + 1] - add %o1, 4, %o1 - be 43f - and %o2, 0xffffffc0, %o3 - - ld [%o0 - 1], %o4 - add %o0, 4, %o0 -4: - SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) - SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) - SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) - SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) - subcc %o3, 64, %o3 - add %o1, 64, %o1 - bne 4b - add %o0, 64, %o0 - - andcc %o2, 0x30, %o3 - be,a 1f - srl %g1, 24, %g2 -4: - SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1) - subcc %o3, 16, %o3 - add %o1, 16, %o1 - bne 4b - add %o0, 16, %o0 - - srl %g1, 24, %g2 -1: - st %o4, [%o0 - 5] - b 88f - stb %g2, [%o0 - 1] -41: - SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - subcc %o3, 64, %o3 - add %o1, 64, %o1 - bne 41b - add %o0, 64, %o0 - - andcc %o2, 0x30, %o3 - be,a 1f - srl %g1, 16, %g2 -4: - SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3) - subcc %o3, 16, %o3 - add %o1, 16, %o1 - bne 4b - add %o0, 16, %o0 - - srl %g1, 16, %g2 -1: - sth %g2, [%o0 - 3] - srl %g1, 8, %g4 - b 88f - stb %g4, [%o0 - 1] -43: - SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) - SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) - SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) - SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) - subcc %o3, 64, %o3 - add %o1, 64, %o1 - bne 43b - add %o0, 64, %o0 - - andcc %o2, 0x30, %o3 - be,a 1f - srl %g1, 24, %g2 -4: - SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3) - subcc %o3, 16, %o3 - add %o1, 16, %o1 - bne 4b - add %o0, 16, %o0 - - srl %g1, 24, %g2 -1: - stb %g2, [%o0 + 3] - b 88f - add %o0, 4, %o0 -42: - SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - subcc %o3, 64, %o3 - add %o1, 64, %o1 - bne 42b - add %o0, 64, %o0 - - andcc %o2, 0x30, %o3 - be,a 1f - srl %g1, 16, %g2 -4: - SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2) - subcc %o3, 16, %o3 - add %o1, 16, %o1 - bne 4b - add %o0, 16, %o0 - - srl %g1, 16, %g2 -1: - sth %g2, [%o0 - 2] - - /* Fall through */ - -#endif /* 
FASTER_NONALIGNED */ + ret + restore %g7, %g0, %o0 88: /* short_end */ @@ -1130,7 +521,7 @@ FUNC(memcpy) /* %o0=dst %o1=src %o2=len */ stb %g2, [%o0] 1: retl - RETL_INSN + mov %g7, %o0 90: /* short_aligned_end */ bne 88b diff --git a/arch/sparc/lib/memmove.S b/arch/sparc/lib/memmove.S new file mode 100644 index 00000000000..b7f6334e159 --- /dev/null +++ b/arch/sparc/lib/memmove.S @@ -0,0 +1,30 @@ +/* memmove.S: Simple memmove implementation. + * + * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com) + * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz) + */ + +#include <linux/linkage.h> + + .text +ENTRY(memmove) /* o0=dst o1=src o2=len */ + mov %o0, %g1 + cmp %o0, %o1 + bleu,pt %xcc, memcpy + add %o1, %o2, %g7 + cmp %g7, %o0 + bleu,pt %xcc, memcpy + add %o0, %o2, %o5 + sub %g7, 1, %o1 + + sub %o5, 1, %o0 +1: ldub [%o1], %g7 + subcc %o2, 1, %o2 + sub %o1, 1, %o1 + stb %g7, [%o0] + bne,pt %icc, 1b + sub %o0, 1, %o0 + + retl + mov %g1, %o0 +ENDPROC(memmove) diff --git a/arch/sparc/lib/memscan.S b/arch/sparc/lib/memscan_32.S index 28e78ff090a..4ff1657dfc2 100644 --- a/arch/sparc/lib/memscan.S +++ b/arch/sparc/lib/memscan_32.S @@ -1,4 +1,4 @@ -/* $Id: memscan.S,v 1.4 1996/09/08 02:01:20 davem Exp $ +/* * memscan.S: Optimized memscan for the Sparc. * * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) diff --git a/arch/sparc/lib/memscan_64.S b/arch/sparc/lib/memscan_64.S new file mode 100644 index 00000000000..5686dfa5dc1 --- /dev/null +++ b/arch/sparc/lib/memscan_64.S @@ -0,0 +1,129 @@ +/* + * memscan.S: Optimized memscan for Sparc64. + * + * Copyright (C) 1997,1998 Jakub Jelinek (jj@ultra.linux.cz) + * Copyright (C) 1998 David S. Miller (davem@redhat.com) + */ + +#define HI_MAGIC 0x8080808080808080 +#define LO_MAGIC 0x0101010101010101 +#define ASI_PL 0x88 + + .text + .align 32 + .globl __memscan_zero, __memscan_generic + .globl memscan + +__memscan_zero: + /* %o0 = bufp, %o1 = size */ + brlez,pn %o1, szzero + andcc %o0, 7, %g0 + be,pt %icc, we_are_aligned + sethi %hi(HI_MAGIC), %o4 + ldub [%o0], %o5 +1: subcc %o1, 1, %o1 + brz,pn %o5, 10f + add %o0, 1, %o0 + + be,pn %xcc, szzero + andcc %o0, 7, %g0 + bne,a,pn %icc, 1b + ldub [%o0], %o5 +we_are_aligned: + ldxa [%o0] ASI_PL, %o5 + or %o4, %lo(HI_MAGIC), %o3 + sllx %o3, 32, %o4 + or %o4, %o3, %o3 + + srlx %o3, 7, %o2 +msloop: + sub %o1, 8, %o1 + add %o0, 8, %o0 + sub %o5, %o2, %o4 + xor %o4, %o5, %o4 + andcc %o4, %o3, %g3 + bne,pn %xcc, check_bytes + srlx %o4, 32, %g3 + + brgz,a,pt %o1, msloop + ldxa [%o0] ASI_PL, %o5 +check_bytes: + bne,a,pn %icc, 2f + andcc %o5, 0xff, %g0 + add %o0, -5, %g2 + ba,pt %xcc, 3f + srlx %o5, 32, %g7 + +2: srlx %o5, 8, %g7 + be,pn %icc, 1f + add %o0, -8, %g2 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 + be,pn %icc, 1f + inc %g2 + andcc %g7, 0xff, %g0 + + srlx %g7, 8, %g7 + be,pn %icc, 1f + inc %g2 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 + be,pn %icc, 1f + inc %g2 + andcc %g3, %o3, %g0 + + be,a,pn %icc, 2f + mov %o0, %g2 +3: andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 + be,pn %icc, 1f + inc %g2 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 + + be,pn %icc, 1f + inc %g2 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 + be,pn %icc, 1f + inc %g2 + andcc %g7, 0xff, %g0 + srlx %g7, 8, %g7 + + be,pn %icc, 1f + inc %g2 +2: brgz,a,pt %o1, msloop + ldxa [%o0] ASI_PL, %o5 + inc %g2 +1: add %o0, %o1, %o0 + cmp %g2, %o0 + retl + + movle %xcc, %g2, %o0 +10: retl + sub %o0, 1, %o0 +szzero: retl + nop + +memscan: +__memscan_generic: + /* %o0 = addr, %o1 = c, %o2 = size */ + brz,pn %o2, 3f + add %o0, %o2, %o3 
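[Illustration, not part of the patch.] The __memscan_zero routine added above filters eight bytes at a time with the HI_MAGIC/LO_MAGIC test and then re-checks any flagged word byte by byte (the check_bytes path), because the ((x - LO) ^ x) & HI form can fire on non-zero bytes such as 0x80. A rough portable C model of that scan, with made-up names and no claim to match the kernel API exactly:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define HI_MAGIC64 0x8080808080808080ULL
#define LO_MAGIC64 0x0101010101010101ULL

static void *memscan_zero_sketch(const void *buf, size_t size)
{
	const unsigned char *p = buf;

	/* Align to 8 bytes with a simple byte loop. */
	while (size && ((uintptr_t)p & 7)) {
		if (*p == 0)
			return (void *)p;
		p++;
		size--;
	}

	while (size >= 8) {
		uint64_t x;

		memcpy(&x, p, 8);	/* plain load; byte order does not matter here */
		if (((x - LO_MAGIC64) ^ x) & HI_MAGIC64) {
			/* Candidate word: verify each byte, as check_bytes does. */
			for (int i = 0; i < 8; i++)
				if (p[i] == 0)
					return (void *)(p + i);
		}
		p += 8;
		size -= 8;
	}

	while (size--) {
		if (*p == 0)
			return (void *)p;
		p++;
	}
	return (void *)p;	/* not found: one byte past the area, like memscan() */
}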
+ ldub [%o0], %o5 + sub %g0, %o2, %o4 +1: + cmp %o5, %o1 + be,pn %icc, 2f + addcc %o4, 1, %o4 + bne,a,pt %xcc, 1b + ldub [%o3 + %o4], %o5 + retl + /* The delay slot is the same as the next insn, this is just to make it look more awful */ +2: + add %o3, %o4, %o0 + retl + sub %o0, 1, %o0 +3: + retl + nop diff --git a/arch/sparc/lib/memset.S b/arch/sparc/lib/memset.S index 1c37ea892de..99c017be871 100644 --- a/arch/sparc/lib/memset.S +++ b/arch/sparc/lib/memset.S @@ -60,11 +60,10 @@ .globl __bzero_begin __bzero_begin: - .globl __bzero, __memset, + .globl __bzero .globl memset .globl __memset_start, __memset_end __memset_start: -__memset: memset: and %o1, 0xff, %g3 sll %g3, 8, %g2 diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S deleted file mode 100644 index da693560d87..00000000000 --- a/arch/sparc/lib/mul.S +++ /dev/null @@ -1,137 +0,0 @@ -/* $Id: mul.S,v 1.4 1996/09/30 02:22:32 davem Exp $ - * mul.S: This routine was taken from glibc-1.09 and is covered - * by the GNU Library General Public License Version 2. - */ - -/* - * Signed multiply, from Appendix E of the Sparc Version 8 - * Architecture Manual. - */ - -/* - * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of - * the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies. - */ - - .globl .mul - .globl _Mul -.mul: -_Mul: /* needed for export */ - mov %o0, %y ! multiplier -> Y - andncc %o0, 0xfff, %g0 ! test bits 12..31 - be Lmul_shortway ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product and clear N and V - - /* - * Long multiply. 32 steps, followed by a final shift step. - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %o1, %o4 ! 13 - mulscc %o4, %o1, %o4 ! 14 - mulscc %o4, %o1, %o4 ! 15 - mulscc %o4, %o1, %o4 ! 16 - mulscc %o4, %o1, %o4 ! 17 - mulscc %o4, %o1, %o4 ! 18 - mulscc %o4, %o1, %o4 ! 19 - mulscc %o4, %o1, %o4 ! 20 - mulscc %o4, %o1, %o4 ! 21 - mulscc %o4, %o1, %o4 ! 22 - mulscc %o4, %o1, %o4 ! 23 - mulscc %o4, %o1, %o4 ! 24 - mulscc %o4, %o1, %o4 ! 25 - mulscc %o4, %o1, %o4 ! 26 - mulscc %o4, %o1, %o4 ! 27 - mulscc %o4, %o1, %o4 ! 28 - mulscc %o4, %o1, %o4 ! 29 - mulscc %o4, %o1, %o4 ! 30 - mulscc %o4, %o1, %o4 ! 31 - mulscc %o4, %o1, %o4 ! 32 - mulscc %o4, %g0, %o4 ! final shift - - ! If %o0 was negative, the result is - ! (%o0 * %o1) + (%o1 << 32)) - ! We fix that here. - -#if 0 - tst %o0 - bge 1f - rd %y, %o0 - - ! %o0 was indeed negative; fix upper 32 bits of result by subtracting - ! %o1 (i.e., return %o4 - %o1 in %o1). - retl - sub %o4, %o1, %o1 - -1: - retl - mov %o4, %o1 -#else - /* Faster code adapted from tege@sics.se's code for umul.S. */ - sra %o0, 31, %o2 ! make mask from sign bit - and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0 - rd %y, %o0 ! get lower half of product - retl - sub %o4, %o2, %o1 ! subtract compensation - ! and put upper half in place -#endif - -Lmul_shortway: - /* - * Short multiply. 12 steps, followed by a final shift step. - * The resulting bits are off by 12 and (32-12) = 20 bit positions, - * but there is no problem with %o0 being negative (unlike above). - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 
5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %g0, %o4 ! final shift - - /* - * %o4 has 20 of the bits that should be in the low part of the - * result; %y has the bottom 12 (as %y's top 12). That is: - * - * %o4 %y - * +----------------+----------------+ - * | -12- | -20- | -12- | -20- | - * +------(---------+------)---------+ - * --hi-- ----low-part---- - * - * The upper 12 bits of %o4 should be sign-extended to form the - * high part of the product (i.e., highpart = %o4 >> 20). - */ - - rd %y, %o5 - sll %o4, 12, %o0 ! shift middle bits left 12 - srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left - or %o5, %o0, %o0 ! construct low part of result - retl - sra %o4, 20, %o1 ! ... and extract high part of result - - .globl .mul_patch -.mul_patch: - smul %o0, %o1, %o0 - retl - rd %y, %o1 - nop diff --git a/arch/sparc/lib/muldi3.S b/arch/sparc/lib/muldi3.S index 7f17872d060..9794939d1c1 100644 --- a/arch/sparc/lib/muldi3.S +++ b/arch/sparc/lib/muldi3.S @@ -63,12 +63,12 @@ __muldi3: rd %y, %o1 mov %o1, %l3 mov %i1, %o0 - call .umul mov %i2, %o1 + umul %o0, %o1, %o0 mov %o0, %l0 mov %i0, %o0 - call .umul mov %i3, %o1 + umul %o0, %o1, %o0 add %l0, %o0, %l0 mov %l2, %i0 add %l2, %l0, %i0 diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S deleted file mode 100644 index bf015a90d07..00000000000 --- a/arch/sparc/lib/rem.S +++ /dev/null @@ -1,384 +0,0 @@ -/* $Id: rem.S,v 1.7 1996/09/30 02:22:34 davem Exp $ - * rem.S: This routine was taken from glibc-1.09 and is covered - * by the GNU Library General Public License Version 2. - */ - - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .rem name of function to generate - * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - * true true=true => signed; true=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - .globl .rem - .globl _Rem -.rem: -_Rem: /* needed for export */ - ! compute sign of result; if neither is negative, no problem - orcc %o1, %o0, %g0 ! either negative? - bge 2f ! no, go do the divide - mov %o0, %g2 ! compute sign in any case - - tst %o1 - bge 1f - tst %o0 - ! %o1 is definitely negative; %o0 might also be negative - bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg -1: ! %o0 is negative, %o1 is nonnegative - sub %g0, %o0, %o0 ! make %o0 nonnegative -2: - - ! Ready to divide. 
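[Illustration, not part of the patch.] The deleted .rem entry code just above normalizes both operands to non-negative values, divides the magnitudes, and restores the sign at Lgot_result; the remainder takes the sign of the dividend (%g2 keeps a copy of %o0 for that test). A minimal C sketch of that wrapper, assuming a non-zero divisor (the assembly traps with ta ST_DIV0 otherwise); the helper name is invented:

#include <stdint.h>

static int32_t rem_sketch(int32_t dividend, int32_t divisor)
{
	/* The remainder takes the sign of the dividend (%g2 in the assembly). */
	int negative = dividend < 0;
	uint32_t a = negative ? 0u - (uint32_t)dividend : (uint32_t)dividend;
	uint32_t b = divisor < 0 ? 0u - (uint32_t)divisor : (uint32_t)divisor;
	uint32_t r = a % b;	/* the assembly develops this four bits at a time */

	return negative ? -(int32_t)r : (int32_t)r;
}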
Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 - - sethi %hi(1 << (32 - 4 - 1)), %g1 - - cmp %o3, %g1 - blu Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g7 - - sll %o5, 4, %o5 - - b 1b - add %o4, 1, %o4 - - ! Now compute %g7. - 2: - addcc %o5, %o5, %o5 - - bcc Lnot_too_big - add %g7, 1, %g7 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - - b Ldo_single_div - sub %g7, 1, %g7 - - Lnot_too_big: - 3: - cmp %o5, %o3 - blu 2b - nop - - be Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g7 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - Ldo_single_div: - subcc %g7, 1, %g7 - bl Lend_regular_divide - nop - - sub %o3, %o5, %o3 - mov 1, %o2 - - b Lend_single_divloop - nop - Lsingle_divloop: - sll %o2, 1, %o2 - - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - Lend_single_divloop: - subcc %g7, 1, %g7 - bge Lsingle_divloop - tst %o3 - - b,a Lend_regular_divide - -Lnot_really_big: -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl L.1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl L.2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl L.3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl L.4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - - b 9f - add %o2, (7*2+1), %o2 - -L.4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - -L.3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl L.4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -L.4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - -L.2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl L.3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! 
depth 4, accumulated bits 3 - bl L.4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -L.4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - -L.3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl L.4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -L.4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - -L.1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl L.2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl L.3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl L.4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -L.4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - -L.3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl L.4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -L.4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - -L.2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl L.3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl L.4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -L.4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -L.3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl L.4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -L.4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - 9: -Lend_regular_divide: - subcc %o4, 1, %o4 - bge Ldivloop - tst %o3 - - bl,a Lgot_result - ! non-restoring fixup here (one instruction only!) - add %o3, %o1, %o3 - -Lgot_result: - ! check to see if answer should be < 0 - tst %g2 - bl,a 1f - sub %g0, %o3, %o3 -1: - retl - mov %o3, %o0 - - .globl .rem_patch -.rem_patch: - sra %o0, 0x1f, %o4 - wr %o4, 0x0, %y - nop - nop - nop - sdivcc %o0, %o1, %o2 - bvs,a 1f - xnor %o2, %g0, %o2 -1: smul %o2, %o1, %o2 - retl - sub %o0, %o2, %o0 - nop diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S deleted file mode 100644 index f406b1f2279..00000000000 --- a/arch/sparc/lib/rwsem.S +++ /dev/null @@ -1,204 +0,0 @@ -/* $Id: rwsem.S,v 1.5 2000/05/09 17:40:13 davem Exp $ - * Assembly part of rw semaphores. 
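[Illustration, not part of the patch.] The .rem_patch stub a few lines above is what gets patched in on CPUs with a hardware divider: it recovers the remainder from the quotient as r = a - (a / b) * b, using sdivcc, smul and sub. The bvs/xnor pair covers the one overflowing quotient (0x80000000 / -1); plain C leaves that case undefined, so this sketch simply assumes it does not occur:

#include <stdint.h>

static int32_t rem_from_div_sketch(int32_t a, int32_t b)
{
	int32_t q = a / b;	/* sdivcc in the patch site */

	return a - q * b;	/* smul + sub in the patch site */
}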
- * - * Copyright (C) 1999 Jakub Jelinek (jakub@redhat.com) - */ - -#include <asm/ptrace.h> -#include <asm/psr.h> - - .section .sched.text, "ax" - .align 4 - - .globl ___down_read -___down_read: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - nop - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - sub %g7, 1, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - sub %g7, 1, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - add %g7, 1, %g7 - nop - nop - subcc %g7, 1, %g7 - bneg 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - bcs 4f - mov %g5, %l5 - call down_read_failed - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba ___down_read - restore %l5, %g0, %g5 -4: call down_read_failed_biased - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .globl ___down_write -___down_write: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - sethi %hi(0x01000000), %g2 - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - sub %g7, %g2, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - sub %g7, %g2, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - add %g7, %g2, %g7 - nop - nop - subcc %g7, %g2, %g7 - bne 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - bcs 4f - mov %g5, %l5 - call down_write_failed - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba ___down_write - restore %l5, %g0, %g5 -4: call down_write_failed_biased - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .text - .globl ___up_read -___up_read: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - nop - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - add %g7, 1, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - add %g7, 1, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - nop - nop - nop - cmp %g7, 0 - be 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - mov %g5, %l5 - clr %o1 - call __rwsem_wake - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .globl ___up_write -___up_write: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - sethi %hi(0x01000000), %g2 - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - add %g7, %g2, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - add %g7, %g2, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - sub %g7, %g2, %g7 - nop - nop - addcc %g7, %g2, %g7 - bcs 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - mov %g5, %l5 - mov %g7, %o1 - call __rwsem_wake - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S deleted file mode 100644 index af9451629d0..00000000000 --- a/arch/sparc/lib/sdiv.S +++ /dev/null @@ -1,381 +0,0 @@ -/* $Id: sdiv.S,v 1.6 1996/10/02 17:37:00 davem Exp $ - * sdiv.S: This routine was taken from glibc-1.09 and is covered - * by the GNU Library General Public License Version 2. - */ - - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. 
- */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .div name of function to generate - * div div=div => %o0 / %o1; div=rem => %o0 % %o1 - * true true=true => signed; true=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - .globl .div - .globl _Div -.div: -_Div: /* needed for export */ - ! compute sign of result; if neither is negative, no problem - orcc %o1, %o0, %g0 ! either negative? - bge 2f ! no, go do the divide - xor %o1, %o0, %g2 ! compute sign in any case - - tst %o1 - bge 1f - tst %o0 - ! %o1 is definitely negative; %o0 might also be negative - bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg -1: ! %o0 is negative, %o1 is nonnegative - sub %g0, %o0, %o0 ! make %o0 nonnegative -2: - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 - - sethi %hi(1 << (32 - 4 - 1)), %g1 - - cmp %o3, %g1 - blu Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g7 - - sll %o5, 4, %o5 - - b 1b - add %o4, 1, %o4 - - ! Now compute %g7. - 2: - addcc %o5, %o5, %o5 - bcc Lnot_too_big - add %g7, 1, %g7 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - - b Ldo_single_div - sub %g7, 1, %g7 - - Lnot_too_big: - 3: - cmp %o5, %o3 - blu 2b - nop - - be Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g7 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... 
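[Illustration, not part of the patch.] All of the divrem.m4-generated routines being deleted here (.div, .rem, .udiv, .urem) share the same shift-and-subtract core described in their header comments: scale the comparand V above the dividend, then develop the quotient Q while reducing the remainder R, with a fix-up if R ends up negative. The sketch below is the simpler restoring variant, one quotient bit per step; the real code is non-restoring and develops four bits per pass, which is what the Ldivloop accumulated-bits tables implement. Divisor of zero is special-cased the way the assembly's ST_DIV0 path effectively behaves:

#include <stdint.h>

static uint32_t udiv_sketch(uint32_t dividend, uint32_t divisor)
{
	uint64_t r = dividend;	/* remainder under development (R) */
	uint64_t v = divisor;	/* comparand (V) */
	uint32_t q = 0;		/* partial quotient (Q) */
	int steps = 0;		/* plays the role of ITER */

	if (divisor == 0)
		return 0;	/* the assembly issues "ta ST_DIV0", then returns 0 */

	/* Scale the comparand above the dividend. */
	while (v <= r) {
		v <<= 1;
		steps++;
	}

	/* One quotient bit per step; the deleted code does four per pass. */
	while (steps--) {
		v >>= 1;
		q <<= 1;
		if (r >= v) {	/* the "remainder is positive" branch */
			r -= v;
			q |= 1;
		}
	}
	return q;		/* r now holds dividend % divisor */
}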
- Ldo_single_div: - subcc %g7, 1, %g7 - bl Lend_regular_divide - nop - - sub %o3, %o5, %o3 - mov 1, %o2 - - b Lend_single_divloop - nop - Lsingle_divloop: - sll %o2, 1, %o2 - - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - Lend_single_divloop: - subcc %g7, 1, %g7 - bge Lsingle_divloop - tst %o3 - - b,a Lend_regular_divide - -Lnot_really_big: -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - - be Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl L.1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl L.2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl L.3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl L.4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -L.4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - -L.3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl L.4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -L.4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - -L.2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl L.3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl L.4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -L.4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -L.3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl L.4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -L.4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - -L.1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl L.2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl L.3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl L.4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -L.4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - -L.3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl L.4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -L.4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - -L.2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl L.3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl L.4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -L.4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - -L.3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl L.4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -L.4.9: - ! 
remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - 9: -Lend_regular_divide: - subcc %o4, 1, %o4 - bge Ldivloop - tst %o3 - - bl,a Lgot_result - ! non-restoring fixup here (one instruction only!) - sub %o2, 1, %o2 - -Lgot_result: - ! check to see if answer should be < 0 - tst %g2 - bl,a 1f - sub %g0, %o2, %o2 -1: - retl - mov %o2, %o0 - - .globl .div_patch -.div_patch: - sra %o0, 0x1f, %o2 - wr %o2, 0x0, %y - nop - nop - nop - sdivcc %o0, %o1, %o0 - bvs,a 1f - xnor %o0, %g0, %o0 -1: retl - nop diff --git a/arch/sparc/lib/strlen.S b/arch/sparc/lib/strlen.S index ed9a763368c..536f83507fb 100644 --- a/arch/sparc/lib/strlen.S +++ b/arch/sparc/lib/strlen.S @@ -1,51 +1,40 @@ /* strlen.S: Sparc optimized strlen code * Hand optimized from GNU libc's strlen * Copyright (C) 1991,1996 Free Software Foundation - * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1996,2008 David S. Miller (davem@davemloft.net) + * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) */ +#include <linux/linkage.h> +#include <asm/asm.h> + #define LO_MAGIC 0x01010101 #define HI_MAGIC 0x80808080 -0: + .text +ENTRY(strlen) + mov %o0, %o1 + andcc %o0, 3, %g0 + BRANCH32(be, pt, 9f) + sethi %hi(HI_MAGIC), %o4 ldub [%o0], %o5 - cmp %o5, 0 - be 1f + BRANCH_REG_ZERO(pn, %o5, 11f) add %o0, 1, %o0 andcc %o0, 3, %g0 - be 4f + BRANCH32(be, pn, 4f) or %o4, %lo(HI_MAGIC), %o3 ldub [%o0], %o5 - cmp %o5, 0 - be 2f + BRANCH_REG_ZERO(pn, %o5, 12f) add %o0, 1, %o0 andcc %o0, 3, %g0 - be 5f + BRANCH32(be, pt, 5f) sethi %hi(LO_MAGIC), %o4 ldub [%o0], %o5 - cmp %o5, 0 - be 3f + BRANCH_REG_ZERO(pn, %o5, 13f) add %o0, 1, %o0 - b 8f + BRANCH32(ba, pt, 8f) or %o4, %lo(LO_MAGIC), %o2 -1: - retl - mov 0, %o0 -2: - retl - mov 1, %o0 -3: - retl - mov 2, %o0 - - .align 4 - .global strlen -strlen: - mov %o0, %o1 - andcc %o0, 3, %g0 - bne 0b - sethi %hi(HI_MAGIC), %o4 +9: or %o4, %lo(HI_MAGIC), %o3 4: sethi %hi(LO_MAGIC), %o4 @@ -56,26 +45,36 @@ strlen: 2: sub %o5, %o2, %o4 andcc %o4, %o3, %g0 - be 8b + BRANCH32(be, pt, 8b) add %o0, 4, %o0 /* Check every byte. */ - srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 - be 1f + srl %o5, 24, %g7 + andcc %g7, 0xff, %g0 + BRANCH32(be, pn, 1f) add %o0, -4, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 - be 1f + srl %o5, 16, %g7 + andcc %g7, 0xff, %g0 + BRANCH32(be, pn, 1f) add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 - be 1f + srl %o5, 8, %g7 + andcc %g7, 0xff, %g0 + BRANCH32(be, pn, 1f) add %o4, 1, %o4 andcc %o5, 0xff, %g0 - bne,a 2b + BRANCH32_ANNUL(bne, pt, 2b) ld [%o0], %o5 add %o4, 1, %o4 1: retl sub %o4, %o1, %o0 +11: + retl + mov 0, %o0 +12: + retl + mov 1, %o0 +13: + retl + mov 2, %o0 +ENDPROC(strlen) diff --git a/arch/sparc/lib/strlen_user.S b/arch/sparc/lib/strlen_user.S deleted file mode 100644 index 8c8a371df3c..00000000000 --- a/arch/sparc/lib/strlen_user.S +++ /dev/null @@ -1,109 +0,0 @@ -/* strlen_user.S: Sparc optimized strlen_user code - * - * Return length of string in userspace including terminating 0 - * or 0 for error - * - * Copyright (C) 1991,1996 Free Software Foundation - * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 - -10: - ldub [%o0], %o5 - cmp %o5, 0 - be 1f - add %o0, 1, %o0 - andcc %o0, 3, %g0 - be 4f - or %o4, %lo(HI_MAGIC), %o3 -11: - ldub [%o0], %o5 - cmp %o5, 0 - be 2f - add %o0, 1, %o0 - andcc %o0, 3, %g0 - be 5f - sethi %hi(LO_MAGIC), %o4 -12: - ldub [%o0], %o5 - cmp %o5, 0 - be 3f - add %o0, 1, %o0 - b 13f - or %o4, %lo(LO_MAGIC), %o2 -1: - retl - mov 1, %o0 -2: - retl - mov 2, %o0 -3: - retl - mov 3, %o0 - - .align 4 - .global __strlen_user, __strnlen_user -__strlen_user: - sethi %hi(32768), %o1 -__strnlen_user: - mov %o1, %g1 - mov %o0, %o1 - andcc %o0, 3, %g0 - bne 10b - sethi %hi(HI_MAGIC), %o4 - or %o4, %lo(HI_MAGIC), %o3 -4: - sethi %hi(LO_MAGIC), %o4 -5: - or %o4, %lo(LO_MAGIC), %o2 -13: - ld [%o0], %o5 -2: - sub %o5, %o2, %o4 - andcc %o4, %o3, %g0 - bne 82f - add %o0, 4, %o0 - sub %o0, %o1, %g2 -81: cmp %g2, %g1 - blu 13b - mov %o0, %o4 - ba,a 1f - - /* Check every byte. */ -82: srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o0, -3, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o4, 1, %o4 - andcc %o5, 0xff, %g0 - bne 81b - sub %o0, %o1, %g2 - - add %o4, 1, %o4 -1: - retl - sub %o4, %o1, %o0 - - .section .fixup,#alloc,#execinstr - .align 4 -9: - retl - clr %o0 - - .section __ex_table,#alloc - .align 4 - - .word 10b, 9b - .word 11b, 9b - .word 12b, 9b - .word 13b, 9b diff --git a/arch/sparc/lib/strncmp.S b/arch/sparc/lib/strncmp_32.S index 615626805d4..c0d1b568c1c 100644 --- a/arch/sparc/lib/strncmp.S +++ b/arch/sparc/lib/strncmp_32.S @@ -1,13 +1,12 @@ -/* $Id: strncmp.S,v 1.2 1996/09/09 02:47:20 davem Exp $ +/* * strncmp.S: Hand optimized Sparc assembly of GCC output from GNU libc * generic strncmp routine. */ +#include <linux/linkage.h> + .text - .align 4 - .global __strncmp, strncmp -__strncmp: -strncmp: +ENTRY(strncmp) mov %o0, %g3 mov 0, %o3 @@ -116,3 +115,4 @@ strncmp: and %g2, 0xff, %o0 retl sub %o3, %o0, %o0 +ENDPROC(strncmp) diff --git a/arch/sparc/lib/strncmp_64.S b/arch/sparc/lib/strncmp_64.S new file mode 100644 index 00000000000..0656627166f --- /dev/null +++ b/arch/sparc/lib/strncmp_64.S @@ -0,0 +1,30 @@ +/* + * Sparc64 optimized strncmp code. + * + * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#include <linux/linkage.h> +#include <asm/asi.h> + + .text +ENTRY(strncmp) + brlez,pn %o2, 3f + lduba [%o0] (ASI_PNF), %o3 +1: + add %o0, 1, %o0 + ldub [%o1], %o4 + brz,pn %o3, 2f + add %o1, 1, %o1 + cmp %o3, %o4 + bne,pn %icc, 2f + subcc %o2, 1, %o2 + bne,a,pt %xcc, 1b + ldub [%o0], %o3 +2: + retl + sub %o3, %o4, %o0 +3: + retl + clr %o0 +ENDPROC(strncmp) diff --git a/arch/sparc/lib/strncpy_from_user.S b/arch/sparc/lib/strncpy_from_user.S deleted file mode 100644 index d77198976a6..00000000000 --- a/arch/sparc/lib/strncpy_from_user.S +++ /dev/null @@ -1,47 +0,0 @@ -/* strncpy_from_user.S: Sparc strncpy from userspace. - * - * Copyright(C) 1996 David S. 
Miller - */ - -#include <asm/ptrace.h> -#include <asm/errno.h> - - .text - .align 4 - - /* Must return: - * - * -EFAULT for an exception - * count if we hit the buffer limit - * bytes copied if we hit a null byte - */ - - .globl __strncpy_from_user -__strncpy_from_user: - /* %o0=dest, %o1=src, %o2=count */ - mov %o2, %o3 -1: - subcc %o2, 1, %o2 - bneg 2f - nop -10: - ldub [%o1], %o4 - add %o0, 1, %o0 - cmp %o4, 0 - add %o1, 1, %o1 - bne 1b - stb %o4, [%o0 - 1] -2: - add %o2, 1, %o0 - retl - sub %o3, %o0, %o0 - - .section .fixup,#alloc,#execinstr - .align 4 -4: - retl - mov -EFAULT, %o0 - - .section __ex_table,#alloc - .align 4 - .word 10b, 4b diff --git a/arch/sparc/lib/ucmpdi2.c b/arch/sparc/lib/ucmpdi2.c new file mode 100644 index 00000000000..1e06ed50068 --- /dev/null +++ b/arch/sparc/lib/ucmpdi2.c @@ -0,0 +1,19 @@ +#include <linux/module.h> +#include "libgcc.h" + +word_type __ucmpdi2(unsigned long long a, unsigned long long b) +{ + const DWunion au = {.ll = a}; + const DWunion bu = {.ll = b}; + + if ((unsigned int) au.s.high < (unsigned int) bu.s.high) + return 0; + else if ((unsigned int) au.s.high > (unsigned int) bu.s.high) + return 2; + if ((unsigned int) au.s.low < (unsigned int) bu.s.low) + return 0; + else if ((unsigned int) au.s.low > (unsigned int) bu.s.low) + return 2; + return 1; +} +EXPORT_SYMBOL(__ucmpdi2); diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S deleted file mode 100644 index 169e01da671..00000000000 --- a/arch/sparc/lib/udiv.S +++ /dev/null @@ -1,357 +0,0 @@ -/* $Id: udiv.S,v 1.4 1996/09/30 02:22:38 davem Exp $ - * udiv.S: This routine was taken from glibc-1.09 and is covered - * by the GNU Library General Public License Version 2. - */ - - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .udiv name of function to generate - * div div=div => %o0 / %o1; div=rem => %o0 % %o1 - * false false=true => signed; false=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - .globl .udiv - .globl _Udiv -.udiv: -_Udiv: /* needed for export */ - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 - - sethi %hi(1 << (32 - 4 - 1)), %g1 - - cmp %o3, %g1 - blu Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. 
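[Illustration, not part of the patch.] The __ucmpdi2() helper added above follows the usual libgcc convention: return 0 when a < b, 1 when a == b, and 2 when a > b, so the compiler can compare the result against 1. A plain-C model of that contract, without the DWunion plumbing from libgcc.h, plus a tiny self-check (names are illustrative only):

#include <assert.h>

static int ucmpdi2_model(unsigned long long a, unsigned long long b)
{
	if (a < b)
		return 0;
	if (a > b)
		return 2;
	return 1;
}

static void ucmpdi2_selfcheck(void)
{
	assert(ucmpdi2_model(1ULL, 2ULL) == 0);
	assert(ucmpdi2_model(1ULL << 40, 1ULL << 40) == 1);
	assert(ucmpdi2_model(~0ULL, 0ULL) == 2);
}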
We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g7 - - sll %o5, 4, %o5 - - b 1b - add %o4, 1, %o4 - - ! Now compute %g7. - 2: - addcc %o5, %o5, %o5 - bcc Lnot_too_big - add %g7, 1, %g7 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - - b Ldo_single_div - sub %g7, 1, %g7 - - Lnot_too_big: - 3: - cmp %o5, %o3 - blu 2b - nop - - be Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g7 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - Ldo_single_div: - subcc %g7, 1, %g7 - bl Lend_regular_divide - nop - - sub %o3, %o5, %o3 - mov 1, %o2 - - b Lend_single_divloop - nop - Lsingle_divloop: - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - Lend_single_divloop: - subcc %g7, 1, %g7 - bge Lsingle_divloop - tst %o3 - - b,a Lend_regular_divide - -Lnot_really_big: -1: - sll %o5, 4, %o5 - - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - - be Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl L.1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl L.2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl L.3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl L.4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -L.4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - -L.3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl L.4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -L.4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - -L.2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl L.3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl L.4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -L.4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - -L.3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl L.4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -L.4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - -L.1.16: - ! 
remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl L.2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl L.3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl L.4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -L.4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - -L.3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl L.4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -L.4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - -L.2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl L.3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl L.4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -L.4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - -L.3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl L.4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -L.4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - 9: -Lend_regular_divide: - subcc %o4, 1, %o4 - bge Ldivloop - tst %o3 - - bl,a Lgot_result - ! non-restoring fixup here (one instruction only!) - sub %o2, 1, %o2 - -Lgot_result: - - retl - mov %o2, %o0 - - .globl .udiv_patch -.udiv_patch: - wr %g0, 0x0, %y - nop - nop - retl - udiv %o0, %o1, %o0 - nop diff --git a/arch/sparc/lib/udivdi3.S b/arch/sparc/lib/udivdi3.S index b430f1f0ef6..24e0a355e2e 100644 --- a/arch/sparc/lib/udivdi3.S +++ b/arch/sparc/lib/udivdi3.S @@ -60,8 +60,9 @@ __udivdi3: bne .LL77 mov %i0,%o2 mov 1,%o0 - call .udiv,0 mov 0,%o1 + wr %g0, 0, %y + udiv %o0, %o1, %o0 mov %o0,%o3 mov %i0,%o2 .LL77: diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S deleted file mode 100644 index f0e5b20a253..00000000000 --- a/arch/sparc/lib/umul.S +++ /dev/null @@ -1,171 +0,0 @@ -/* $Id: umul.S,v 1.4 1996/09/30 02:22:39 davem Exp $ - * umul.S: This routine was taken from glibc-1.09 and is covered - * by the GNU Library General Public License Version 2. - */ - - -/* - * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the - * upper 32 bits of the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies. Short - * multiplies require 25 instruction cycles, and long ones require - * 45 instruction cycles. - * - * On return, overflow has occurred (%o1 is not zero) if and only if - * the Z condition code is clear, allowing, e.g., the following: - * - * call .umul - * nop - * bnz overflow (or tnz) - */ - - .globl .umul - .globl _Umul -.umul: -_Umul: /* needed for export */ - or %o0, %o1, %o4 - mov %o0, %y ! multiplier -> Y - - andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args - be Lmul_shortway ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product and clear N and V - - /* - * Long multiply. 32 steps, followed by a final shift step. - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 
9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %o1, %o4 ! 13 - mulscc %o4, %o1, %o4 ! 14 - mulscc %o4, %o1, %o4 ! 15 - mulscc %o4, %o1, %o4 ! 16 - mulscc %o4, %o1, %o4 ! 17 - mulscc %o4, %o1, %o4 ! 18 - mulscc %o4, %o1, %o4 ! 19 - mulscc %o4, %o1, %o4 ! 20 - mulscc %o4, %o1, %o4 ! 21 - mulscc %o4, %o1, %o4 ! 22 - mulscc %o4, %o1, %o4 ! 23 - mulscc %o4, %o1, %o4 ! 24 - mulscc %o4, %o1, %o4 ! 25 - mulscc %o4, %o1, %o4 ! 26 - mulscc %o4, %o1, %o4 ! 27 - mulscc %o4, %o1, %o4 ! 28 - mulscc %o4, %o1, %o4 ! 29 - mulscc %o4, %o1, %o4 ! 30 - mulscc %o4, %o1, %o4 ! 31 - mulscc %o4, %o1, %o4 ! 32 - mulscc %o4, %g0, %o4 ! final shift - - - /* - * Normally, with the shift-and-add approach, if both numbers are - * positive you get the correct result. With 32-bit two's-complement - * numbers, -x is represented as - * - * x 32 - * ( 2 - ------ ) mod 2 * 2 - * 32 - * 2 - * - * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s, - * we can treat this as if the radix point were just to the left - * of the sign bit (multiply by 2^32), and get - * - * -x = (2 - x) mod 2 - * - * Then, ignoring the `mod 2's for convenience: - * - * x * y = xy - * -x * y = 2y - xy - * x * -y = 2x - xy - * -x * -y = 4 - 2x - 2y + xy - * - * For signed multiplies, we subtract (x << 32) from the partial - * product to fix this problem for negative multipliers (see mul.s). - * Because of the way the shift into the partial product is calculated - * (N xor V), this term is automatically removed for the multiplicand, - * so we don't have to adjust. - * - * But for unsigned multiplies, the high order bit wasn't a sign bit, - * and the correction is wrong. So for unsigned multiplies where the - * high order bit is one, we end up with xy - (y << 32). To fix it - * we add y << 32. - */ -#if 0 - tst %o1 - bl,a 1f ! if %o1 < 0 (high order bit = 1), - add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) - -1: - rd %y, %o0 ! get lower half of product - retl - addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 -#else - /* Faster code from tege@sics.se. */ - sra %o1, 31, %o2 ! make mask from sign bit - and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1 - rd %y, %o0 ! get lower half of product - retl - addcc %o4, %o2, %o1 ! add compensation and put upper half in place -#endif - -Lmul_shortway: - /* - * Short multiply. 12 steps, followed by a final shift step. - * The resulting bits are off by 12 and (32-12) = 20 bit positions, - * but there is no problem with %o0 being negative (unlike above), - * and overflow is impossible (the answer is at most 24 bits long). - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %g0, %o4 ! final shift - - /* - * %o4 has 20 of the bits that should be in the result; %y has - * the bottom 12 (as %y's top 12). That is: - * - * %o4 %y - * +----------------+----------------+ - * | -12- | -20- | -12- | -20- | - * +------(---------+------)---------+ - * -----result----- - * - * The 12 bits of %o4 left of the `result' area are all zero; - * in fact, all top 20 bits of %o4 are zero. - */ - - rd %y, %o5 - sll %o4, 12, %o0 ! shift middle bits left 12 - srl %o5, 20, %o5 ! shift low bits right 20 - or %o5, %o0, %o0 - retl - addcc %g0, %g0, %o1 ! 
%o1 = zero, and set Z - - .globl .umul_patch -.umul_patch: - umul %o0, %o1, %o0 - retl - rd %y, %o1 - nop diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S deleted file mode 100644 index 6b92bdc8b04..00000000000 --- a/arch/sparc/lib/urem.S +++ /dev/null @@ -1,357 +0,0 @@ -/* $Id: urem.S,v 1.4 1996/09/30 02:22:42 davem Exp $ - * urem.S: This routine was taken from glibc-1.09 and is covered - * by the GNU Library General Public License Version 2. - */ - -/* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .urem name of function to generate - * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - * false false=true => signed; false=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - .globl .urem - .globl _Urem -.urem: -_Urem: /* needed for export */ - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta ST_DIV0 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu Lgot_result ! (and algorithm fails otherwise) - clr %o2 - - sethi %hi(1 << (32 - 4 - 1)), %g1 - - cmp %o3, %g1 - blu Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g7 - - sll %o5, 4, %o5 - - b 1b - add %o4, 1, %o4 - - ! Now compute %g7. - 2: - addcc %o5, %o5, %o5 - bcc Lnot_too_big - add %g7, 1, %g7 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - - b Ldo_single_div - sub %g7, 1, %g7 - - Lnot_too_big: - 3: - cmp %o5, %o3 - blu 2b - nop - - be Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g7 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. 
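[Illustration, not part of the patch.] The fix-up in the deleted umul.S above rests on one identity: the mulscc sequence effectively multiplies %o0 by %o1 taken as a signed value, so when %o1 has its top bit set the 64-bit product is short by (%o0 << 32), and the "faster code from tege@sics.se" adds that term back via a sign mask. The identity can be checked in C, with all arithmetic taken mod 2^64:

#include <assert.h>
#include <stdint.h>

/* Check:  a * b  ==  a * signed(b)  +  (top bit of b set ? a << 32 : 0),
 * mod 2^64 -- the adjustment .umul performs before returning the high
 * half of the product in %o1. */
static void umul_fixup_check(uint32_t a, uint32_t b)
{
	int64_t  sb = (b & 0x80000000u) ? (int64_t)b - ((int64_t)1 << 32)
					: (int64_t)b;
	uint64_t true_product = (uint64_t)a * b;
	uint64_t signed_part  = (uint64_t)((int64_t)a * sb);
	uint64_t fixup        = (b & 0x80000000u) ? ((uint64_t)a << 32) : 0;

	assert(true_product == signed_part + fixup);
}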
Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - Ldo_single_div: - subcc %g7, 1, %g7 - bl Lend_regular_divide - nop - - sub %o3, %o5, %o3 - mov 1, %o2 - - b Lend_single_divloop - nop - Lsingle_divloop: - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - Lend_single_divloop: - subcc %g7, 1, %g7 - bge Lsingle_divloop - tst %o3 - - b,a Lend_regular_divide - -Lnot_really_big: -1: - sll %o5, 4, %o5 - - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - - be Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl L.1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl L.2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl L.3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl L.4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -L.4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - -L.3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl L.4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -L.4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - -L.2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl L.3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl L.4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -L.4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - -L.3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl L.4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -L.4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - -L.1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl L.2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl L.3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl L.4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -L.4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - -L.3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl L.4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -L.4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - -L.2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl L.3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl L.4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -L.4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - -L.3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! 
depth 4, accumulated bits -7 - bl L.4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -L.4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - 9: -Lend_regular_divide: - subcc %o4, 1, %o4 - bge Ldivloop - tst %o3 - - bl,a Lgot_result - ! non-restoring fixup here (one instruction only!) - add %o3, %o1, %o3 - -Lgot_result: - - retl - mov %o3, %o0 - - .globl .urem_patch -.urem_patch: - wr %g0, 0x0, %y - nop - nop - nop - udiv %o0, %o1, %o2 - umul %o2, %o1, %o2 - retl - sub %o0, %o2, %o0 diff --git a/arch/sparc/lib/user_fixup.c b/arch/sparc/lib/user_fixup.c new file mode 100644 index 00000000000..ac96ae23670 --- /dev/null +++ b/arch/sparc/lib/user_fixup.c @@ -0,0 +1,71 @@ +/* user_fixup.c: Fix up user copy faults. + * + * Copyright (C) 2004 David S. Miller <davem@redhat.com> + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/module.h> + +#include <asm/uaccess.h> + +/* Calculating the exact fault address when using + * block loads and stores can be very complicated. + * + * Instead of trying to be clever and handling all + * of the cases, just fix things up simply here. + */ + +static unsigned long compute_size(unsigned long start, unsigned long size, unsigned long *offset) +{ + unsigned long fault_addr = current_thread_info()->fault_address; + unsigned long end = start + size; + + if (fault_addr < start || fault_addr >= end) { + *offset = 0; + } else { + *offset = fault_addr - start; + size = end - fault_addr; + } + return size; +} + +unsigned long copy_from_user_fixup(void *to, const void __user *from, unsigned long size) +{ + unsigned long offset; + + size = compute_size((unsigned long) from, size, &offset); + if (likely(size)) + memset(to + offset, 0, size); + + return size; +} +EXPORT_SYMBOL(copy_from_user_fixup); + +unsigned long copy_to_user_fixup(void __user *to, const void *from, unsigned long size) +{ + unsigned long offset; + + return compute_size((unsigned long) to, size, &offset); +} +EXPORT_SYMBOL(copy_to_user_fixup); + +unsigned long copy_in_user_fixup(void __user *to, void __user *from, unsigned long size) +{ + unsigned long fault_addr = current_thread_info()->fault_address; + unsigned long start = (unsigned long) to; + unsigned long end = start + size; + + if (fault_addr >= start && fault_addr < end) + return end - fault_addr; + + start = (unsigned long) from; + end = start + size; + if (fault_addr >= start && fault_addr < end) + return end - fault_addr; + + return size; +} +EXPORT_SYMBOL(copy_in_user_fixup); diff --git a/arch/sparc/lib/xor.S b/arch/sparc/lib/xor.S new file mode 100644 index 00000000000..2c05641c326 --- /dev/null +++ b/arch/sparc/lib/xor.S @@ -0,0 +1,636 @@ +/* + * arch/sparc64/lib/xor.S + * + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set and Niagara store-init/twin-load. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * Copyright (C) 2006 David S. Miller <davem@davemloft.net> + */ + +#include <linux/linkage.h> +#include <asm/visasm.h> +#include <asm/asi.h> +#include <asm/dcu.h> +#include <asm/spitfire.h> + +/* + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + */ + .text + + /* VIS versions. 
*/ +ENTRY(xor_vis_2) + rd %fprs, %o5 + andcc %o5, FPRS_FEF|FPRS_DU, %g0 + be,pt %icc, 0f + sethi %hi(VISenter), %g1 + jmpl %g1 + %lo(VISenter), %g7 + add %g7, 8, %g7 +0: wr %g0, FPRS_FEF, %fprs + rd %asi, %g1 + wr %g0, ASI_BLK_P, %asi + membar #LoadStore|#StoreLoad|#StoreStore + sub %o0, 128, %o0 + ldda [%o1] %asi, %f0 + ldda [%o2] %asi, %f16 + +2: ldda [%o1 + 64] %asi, %f32 + fxor %f0, %f16, %f16 + fxor %f2, %f18, %f18 + fxor %f4, %f20, %f20 + fxor %f6, %f22, %f22 + fxor %f8, %f24, %f24 + fxor %f10, %f26, %f26 + fxor %f12, %f28, %f28 + fxor %f14, %f30, %f30 + stda %f16, [%o1] %asi + ldda [%o2 + 64] %asi, %f48 + ldda [%o1 + 128] %asi, %f0 + fxor %f32, %f48, %f48 + fxor %f34, %f50, %f50 + add %o1, 128, %o1 + fxor %f36, %f52, %f52 + add %o2, 128, %o2 + fxor %f38, %f54, %f54 + subcc %o0, 128, %o0 + fxor %f40, %f56, %f56 + fxor %f42, %f58, %f58 + fxor %f44, %f60, %f60 + fxor %f46, %f62, %f62 + stda %f48, [%o1 - 64] %asi + bne,pt %xcc, 2b + ldda [%o2] %asi, %f16 + + ldda [%o1 + 64] %asi, %f32 + fxor %f0, %f16, %f16 + fxor %f2, %f18, %f18 + fxor %f4, %f20, %f20 + fxor %f6, %f22, %f22 + fxor %f8, %f24, %f24 + fxor %f10, %f26, %f26 + fxor %f12, %f28, %f28 + fxor %f14, %f30, %f30 + stda %f16, [%o1] %asi + ldda [%o2 + 64] %asi, %f48 + membar #Sync + fxor %f32, %f48, %f48 + fxor %f34, %f50, %f50 + fxor %f36, %f52, %f52 + fxor %f38, %f54, %f54 + fxor %f40, %f56, %f56 + fxor %f42, %f58, %f58 + fxor %f44, %f60, %f60 + fxor %f46, %f62, %f62 + stda %f48, [%o1 + 64] %asi + membar #Sync|#StoreStore|#StoreLoad + wr %g1, %g0, %asi + retl + wr %g0, 0, %fprs +ENDPROC(xor_vis_2) + +ENTRY(xor_vis_3) + rd %fprs, %o5 + andcc %o5, FPRS_FEF|FPRS_DU, %g0 + be,pt %icc, 0f + sethi %hi(VISenter), %g1 + jmpl %g1 + %lo(VISenter), %g7 + add %g7, 8, %g7 +0: wr %g0, FPRS_FEF, %fprs + rd %asi, %g1 + wr %g0, ASI_BLK_P, %asi + membar #LoadStore|#StoreLoad|#StoreStore + sub %o0, 64, %o0 + ldda [%o1] %asi, %f0 + ldda [%o2] %asi, %f16 + +3: ldda [%o3] %asi, %f32 + fxor %f0, %f16, %f48 + fxor %f2, %f18, %f50 + add %o1, 64, %o1 + fxor %f4, %f20, %f52 + fxor %f6, %f22, %f54 + add %o2, 64, %o2 + fxor %f8, %f24, %f56 + fxor %f10, %f26, %f58 + fxor %f12, %f28, %f60 + fxor %f14, %f30, %f62 + ldda [%o1] %asi, %f0 + fxor %f48, %f32, %f48 + fxor %f50, %f34, %f50 + fxor %f52, %f36, %f52 + fxor %f54, %f38, %f54 + add %o3, 64, %o3 + fxor %f56, %f40, %f56 + fxor %f58, %f42, %f58 + subcc %o0, 64, %o0 + fxor %f60, %f44, %f60 + fxor %f62, %f46, %f62 + stda %f48, [%o1 - 64] %asi + bne,pt %xcc, 3b + ldda [%o2] %asi, %f16 + + ldda [%o3] %asi, %f32 + fxor %f0, %f16, %f48 + fxor %f2, %f18, %f50 + fxor %f4, %f20, %f52 + fxor %f6, %f22, %f54 + fxor %f8, %f24, %f56 + fxor %f10, %f26, %f58 + fxor %f12, %f28, %f60 + fxor %f14, %f30, %f62 + membar #Sync + fxor %f48, %f32, %f48 + fxor %f50, %f34, %f50 + fxor %f52, %f36, %f52 + fxor %f54, %f38, %f54 + fxor %f56, %f40, %f56 + fxor %f58, %f42, %f58 + fxor %f60, %f44, %f60 + fxor %f62, %f46, %f62 + stda %f48, [%o1] %asi + membar #Sync|#StoreStore|#StoreLoad + wr %g1, %g0, %asi + retl + wr %g0, 0, %fprs +ENDPROC(xor_vis_3) + +ENTRY(xor_vis_4) + rd %fprs, %o5 + andcc %o5, FPRS_FEF|FPRS_DU, %g0 + be,pt %icc, 0f + sethi %hi(VISenter), %g1 + jmpl %g1 + %lo(VISenter), %g7 + add %g7, 8, %g7 +0: wr %g0, FPRS_FEF, %fprs + rd %asi, %g1 + wr %g0, ASI_BLK_P, %asi + membar #LoadStore|#StoreLoad|#StoreStore + sub %o0, 64, %o0 + ldda [%o1] %asi, %f0 + ldda [%o2] %asi, %f16 + +4: ldda [%o3] %asi, %f32 + fxor %f0, %f16, %f16 + fxor %f2, %f18, %f18 + add %o1, 64, %o1 + fxor %f4, %f20, %f20 + fxor %f6, %f22, %f22 + add %o2, 64, %o2 
+ fxor %f8, %f24, %f24 + fxor %f10, %f26, %f26 + fxor %f12, %f28, %f28 + fxor %f14, %f30, %f30 + ldda [%o4] %asi, %f48 + fxor %f16, %f32, %f32 + fxor %f18, %f34, %f34 + fxor %f20, %f36, %f36 + fxor %f22, %f38, %f38 + add %o3, 64, %o3 + fxor %f24, %f40, %f40 + fxor %f26, %f42, %f42 + fxor %f28, %f44, %f44 + fxor %f30, %f46, %f46 + ldda [%o1] %asi, %f0 + fxor %f32, %f48, %f48 + fxor %f34, %f50, %f50 + fxor %f36, %f52, %f52 + add %o4, 64, %o4 + fxor %f38, %f54, %f54 + fxor %f40, %f56, %f56 + fxor %f42, %f58, %f58 + subcc %o0, 64, %o0 + fxor %f44, %f60, %f60 + fxor %f46, %f62, %f62 + stda %f48, [%o1 - 64] %asi + bne,pt %xcc, 4b + ldda [%o2] %asi, %f16 + + ldda [%o3] %asi, %f32 + fxor %f0, %f16, %f16 + fxor %f2, %f18, %f18 + fxor %f4, %f20, %f20 + fxor %f6, %f22, %f22 + fxor %f8, %f24, %f24 + fxor %f10, %f26, %f26 + fxor %f12, %f28, %f28 + fxor %f14, %f30, %f30 + ldda [%o4] %asi, %f48 + fxor %f16, %f32, %f32 + fxor %f18, %f34, %f34 + fxor %f20, %f36, %f36 + fxor %f22, %f38, %f38 + fxor %f24, %f40, %f40 + fxor %f26, %f42, %f42 + fxor %f28, %f44, %f44 + fxor %f30, %f46, %f46 + membar #Sync + fxor %f32, %f48, %f48 + fxor %f34, %f50, %f50 + fxor %f36, %f52, %f52 + fxor %f38, %f54, %f54 + fxor %f40, %f56, %f56 + fxor %f42, %f58, %f58 + fxor %f44, %f60, %f60 + fxor %f46, %f62, %f62 + stda %f48, [%o1] %asi + membar #Sync|#StoreStore|#StoreLoad + wr %g1, %g0, %asi + retl + wr %g0, 0, %fprs +ENDPROC(xor_vis_4) + +ENTRY(xor_vis_5) + save %sp, -192, %sp + rd %fprs, %o5 + andcc %o5, FPRS_FEF|FPRS_DU, %g0 + be,pt %icc, 0f + sethi %hi(VISenter), %g1 + jmpl %g1 + %lo(VISenter), %g7 + add %g7, 8, %g7 +0: wr %g0, FPRS_FEF, %fprs + rd %asi, %g1 + wr %g0, ASI_BLK_P, %asi + membar #LoadStore|#StoreLoad|#StoreStore + sub %i0, 64, %i0 + ldda [%i1] %asi, %f0 + ldda [%i2] %asi, %f16 + +5: ldda [%i3] %asi, %f32 + fxor %f0, %f16, %f48 + fxor %f2, %f18, %f50 + add %i1, 64, %i1 + fxor %f4, %f20, %f52 + fxor %f6, %f22, %f54 + add %i2, 64, %i2 + fxor %f8, %f24, %f56 + fxor %f10, %f26, %f58 + fxor %f12, %f28, %f60 + fxor %f14, %f30, %f62 + ldda [%i4] %asi, %f16 + fxor %f48, %f32, %f48 + fxor %f50, %f34, %f50 + fxor %f52, %f36, %f52 + fxor %f54, %f38, %f54 + add %i3, 64, %i3 + fxor %f56, %f40, %f56 + fxor %f58, %f42, %f58 + fxor %f60, %f44, %f60 + fxor %f62, %f46, %f62 + ldda [%i5] %asi, %f32 + fxor %f48, %f16, %f48 + fxor %f50, %f18, %f50 + add %i4, 64, %i4 + fxor %f52, %f20, %f52 + fxor %f54, %f22, %f54 + add %i5, 64, %i5 + fxor %f56, %f24, %f56 + fxor %f58, %f26, %f58 + fxor %f60, %f28, %f60 + fxor %f62, %f30, %f62 + ldda [%i1] %asi, %f0 + fxor %f48, %f32, %f48 + fxor %f50, %f34, %f50 + fxor %f52, %f36, %f52 + fxor %f54, %f38, %f54 + fxor %f56, %f40, %f56 + fxor %f58, %f42, %f58 + subcc %i0, 64, %i0 + fxor %f60, %f44, %f60 + fxor %f62, %f46, %f62 + stda %f48, [%i1 - 64] %asi + bne,pt %xcc, 5b + ldda [%i2] %asi, %f16 + + ldda [%i3] %asi, %f32 + fxor %f0, %f16, %f48 + fxor %f2, %f18, %f50 + fxor %f4, %f20, %f52 + fxor %f6, %f22, %f54 + fxor %f8, %f24, %f56 + fxor %f10, %f26, %f58 + fxor %f12, %f28, %f60 + fxor %f14, %f30, %f62 + ldda [%i4] %asi, %f16 + fxor %f48, %f32, %f48 + fxor %f50, %f34, %f50 + fxor %f52, %f36, %f52 + fxor %f54, %f38, %f54 + fxor %f56, %f40, %f56 + fxor %f58, %f42, %f58 + fxor %f60, %f44, %f60 + fxor %f62, %f46, %f62 + ldda [%i5] %asi, %f32 + fxor %f48, %f16, %f48 + fxor %f50, %f18, %f50 + fxor %f52, %f20, %f52 + fxor %f54, %f22, %f54 + fxor %f56, %f24, %f56 + fxor %f58, %f26, %f58 + fxor %f60, %f28, %f60 + fxor %f62, %f30, %f62 + membar #Sync + fxor %f48, %f32, %f48 + fxor %f50, %f34, %f50 + fxor %f52, 
%f36, %f52 + fxor %f54, %f38, %f54 + fxor %f56, %f40, %f56 + fxor %f58, %f42, %f58 + fxor %f60, %f44, %f60 + fxor %f62, %f46, %f62 + stda %f48, [%i1] %asi + membar #Sync|#StoreStore|#StoreLoad + wr %g1, %g0, %asi + wr %g0, 0, %fprs + ret + restore +ENDPROC(xor_vis_5) + + /* Niagara versions. */ +ENTRY(xor_niagara_2) /* %o0=bytes, %o1=dest, %o2=src */ + save %sp, -192, %sp + prefetch [%i1], #n_writes + prefetch [%i2], #one_read + rd %asi, %g7 + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + srlx %i0, 6, %g1 + mov %i1, %i0 + mov %i2, %i1 +1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src + 0x00 */ + ldda [%i1 + 0x10] %asi, %i4 /* %i4/%i5 = src + 0x10 */ + ldda [%i1 + 0x20] %asi, %g2 /* %g2/%g3 = src + 0x20 */ + ldda [%i1 + 0x30] %asi, %l0 /* %l0/%l1 = src + 0x30 */ + prefetch [%i1 + 0x40], #one_read + ldda [%i0 + 0x00] %asi, %o0 /* %o0/%o1 = dest + 0x00 */ + ldda [%i0 + 0x10] %asi, %o2 /* %o2/%o3 = dest + 0x10 */ + ldda [%i0 + 0x20] %asi, %o4 /* %o4/%o5 = dest + 0x20 */ + ldda [%i0 + 0x30] %asi, %l2 /* %l2/%l3 = dest + 0x30 */ + prefetch [%i0 + 0x40], #n_writes + xor %o0, %i2, %o0 + xor %o1, %i3, %o1 + stxa %o0, [%i0 + 0x00] %asi + stxa %o1, [%i0 + 0x08] %asi + xor %o2, %i4, %o2 + xor %o3, %i5, %o3 + stxa %o2, [%i0 + 0x10] %asi + stxa %o3, [%i0 + 0x18] %asi + xor %o4, %g2, %o4 + xor %o5, %g3, %o5 + stxa %o4, [%i0 + 0x20] %asi + stxa %o5, [%i0 + 0x28] %asi + xor %l2, %l0, %l2 + xor %l3, %l1, %l3 + stxa %l2, [%i0 + 0x30] %asi + stxa %l3, [%i0 + 0x38] %asi + add %i0, 0x40, %i0 + subcc %g1, 1, %g1 + bne,pt %xcc, 1b + add %i1, 0x40, %i1 + membar #Sync + wr %g7, 0x0, %asi + ret + restore +ENDPROC(xor_niagara_2) + +ENTRY(xor_niagara_3) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */ + save %sp, -192, %sp + prefetch [%i1], #n_writes + prefetch [%i2], #one_read + prefetch [%i3], #one_read + rd %asi, %g7 + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + srlx %i0, 6, %g1 + mov %i1, %i0 + mov %i2, %i1 + mov %i3, %l7 +1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */ + ldda [%i1 + 0x10] %asi, %i4 /* %i4/%i5 = src1 + 0x10 */ + ldda [%l7 + 0x00] %asi, %g2 /* %g2/%g3 = src2 + 0x00 */ + ldda [%l7 + 0x10] %asi, %l0 /* %l0/%l1 = src2 + 0x10 */ + ldda [%i0 + 0x00] %asi, %o0 /* %o0/%o1 = dest + 0x00 */ + ldda [%i0 + 0x10] %asi, %o2 /* %o2/%o3 = dest + 0x10 */ + xor %g2, %i2, %g2 + xor %g3, %i3, %g3 + xor %o0, %g2, %o0 + xor %o1, %g3, %o1 + stxa %o0, [%i0 + 0x00] %asi + stxa %o1, [%i0 + 0x08] %asi + ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */ + ldda [%l7 + 0x20] %asi, %g2 /* %g2/%g3 = src2 + 0x20 */ + ldda [%i0 + 0x20] %asi, %o0 /* %o0/%o1 = dest + 0x20 */ + xor %l0, %i4, %l0 + xor %l1, %i5, %l1 + xor %o2, %l0, %o2 + xor %o3, %l1, %o3 + stxa %o2, [%i0 + 0x10] %asi + stxa %o3, [%i0 + 0x18] %asi + ldda [%i1 + 0x30] %asi, %i4 /* %i4/%i5 = src1 + 0x30 */ + ldda [%l7 + 0x30] %asi, %l0 /* %l0/%l1 = src2 + 0x30 */ + ldda [%i0 + 0x30] %asi, %o2 /* %o2/%o3 = dest + 0x30 */ + prefetch [%i1 + 0x40], #one_read + prefetch [%l7 + 0x40], #one_read + prefetch [%i0 + 0x40], #n_writes + xor %g2, %i2, %g2 + xor %g3, %i3, %g3 + xor %o0, %g2, %o0 + xor %o1, %g3, %o1 + stxa %o0, [%i0 + 0x20] %asi + stxa %o1, [%i0 + 0x28] %asi + xor %l0, %i4, %l0 + xor %l1, %i5, %l1 + xor %o2, %l0, %o2 + xor %o3, %l1, %o3 + stxa %o2, [%i0 + 0x30] %asi + stxa %o3, [%i0 + 0x38] %asi + add %i0, 0x40, %i0 + add %i1, 0x40, %i1 + subcc %g1, 1, %g1 + bne,pt %xcc, 1b + add %l7, 0x40, %l7 + membar #Sync + wr %g7, 0x0, %asi + ret + restore +ENDPROC(xor_niagara_3) + +ENTRY(xor_niagara_4) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */ + save %sp, -192, 
%sp + prefetch [%i1], #n_writes + prefetch [%i2], #one_read + prefetch [%i3], #one_read + prefetch [%i4], #one_read + rd %asi, %g7 + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + srlx %i0, 6, %g1 + mov %i1, %i0 + mov %i2, %i1 + mov %i3, %l7 + mov %i4, %l6 +1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */ + ldda [%l7 + 0x00] %asi, %i4 /* %i4/%i5 = src2 + 0x00 */ + ldda [%l6 + 0x00] %asi, %g2 /* %g2/%g3 = src3 + 0x00 */ + ldda [%i0 + 0x00] %asi, %l0 /* %l0/%l1 = dest + 0x00 */ + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + ldda [%i1 + 0x10] %asi, %i2 /* %i2/%i3 = src1 + 0x10 */ + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + ldda [%l7 + 0x10] %asi, %i4 /* %i4/%i5 = src2 + 0x10 */ + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + stxa %l0, [%i0 + 0x00] %asi + stxa %l1, [%i0 + 0x08] %asi + ldda [%l6 + 0x10] %asi, %g2 /* %g2/%g3 = src3 + 0x10 */ + ldda [%i0 + 0x10] %asi, %l0 /* %l0/%l1 = dest + 0x10 */ + + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */ + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + ldda [%l7 + 0x20] %asi, %i4 /* %i4/%i5 = src2 + 0x20 */ + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + stxa %l0, [%i0 + 0x10] %asi + stxa %l1, [%i0 + 0x18] %asi + ldda [%l6 + 0x20] %asi, %g2 /* %g2/%g3 = src3 + 0x20 */ + ldda [%i0 + 0x20] %asi, %l0 /* %l0/%l1 = dest + 0x20 */ + + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + ldda [%i1 + 0x30] %asi, %i2 /* %i2/%i3 = src1 + 0x30 */ + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + ldda [%l7 + 0x30] %asi, %i4 /* %i4/%i5 = src2 + 0x30 */ + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + stxa %l0, [%i0 + 0x20] %asi + stxa %l1, [%i0 + 0x28] %asi + ldda [%l6 + 0x30] %asi, %g2 /* %g2/%g3 = src3 + 0x30 */ + ldda [%i0 + 0x30] %asi, %l0 /* %l0/%l1 = dest + 0x30 */ + + prefetch [%i1 + 0x40], #one_read + prefetch [%l7 + 0x40], #one_read + prefetch [%l6 + 0x40], #one_read + prefetch [%i0 + 0x40], #n_writes + + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + stxa %l0, [%i0 + 0x30] %asi + stxa %l1, [%i0 + 0x38] %asi + + add %i0, 0x40, %i0 + add %i1, 0x40, %i1 + add %l7, 0x40, %l7 + subcc %g1, 1, %g1 + bne,pt %xcc, 1b + add %l6, 0x40, %l6 + membar #Sync + wr %g7, 0x0, %asi + ret + restore +ENDPROC(xor_niagara_4) + +ENTRY(xor_niagara_5) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */ + save %sp, -192, %sp + prefetch [%i1], #n_writes + prefetch [%i2], #one_read + prefetch [%i3], #one_read + prefetch [%i4], #one_read + prefetch [%i5], #one_read + rd %asi, %g7 + wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi + srlx %i0, 6, %g1 + mov %i1, %i0 + mov %i2, %i1 + mov %i3, %l7 + mov %i4, %l6 + mov %i5, %l5 +1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */ + ldda [%l7 + 0x00] %asi, %i4 /* %i4/%i5 = src2 + 0x00 */ + ldda [%l6 + 0x00] %asi, %g2 /* %g2/%g3 = src3 + 0x00 */ + ldda [%l5 + 0x00] %asi, %l0 /* %l0/%l1 = src4 + 0x00 */ + ldda [%i0 + 0x00] %asi, %l2 /* %l2/%l3 = dest + 0x00 */ + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + ldda [%i1 + 0x10] %asi, %i2 /* %i2/%i3 = src1 + 0x10 */ + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + ldda [%l7 + 0x10] %asi, %i4 /* %i4/%i5 = src2 + 0x10 */ + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + ldda [%l6 + 0x10] %asi, %g2 /* %g2/%g3 = src3 + 0x10 */ + xor %l2, %l0, %l2 + xor %l3, %l1, %l3 + stxa %l2, [%i0 + 0x00] %asi + stxa %l3, [%i0 + 0x08] %asi + ldda [%l5 + 0x10] %asi, %l0 /* %l0/%l1 = src4 + 0x10 */ + ldda [%i0 + 0x10] %asi, %l2 /* %l2/%l3 = dest + 0x10 */ + + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */ + 
xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + ldda [%l7 + 0x20] %asi, %i4 /* %i4/%i5 = src2 + 0x20 */ + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + ldda [%l6 + 0x20] %asi, %g2 /* %g2/%g3 = src3 + 0x20 */ + xor %l2, %l0, %l2 + xor %l3, %l1, %l3 + stxa %l2, [%i0 + 0x10] %asi + stxa %l3, [%i0 + 0x18] %asi + ldda [%l5 + 0x20] %asi, %l0 /* %l0/%l1 = src4 + 0x20 */ + ldda [%i0 + 0x20] %asi, %l2 /* %l2/%l3 = dest + 0x20 */ + + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + ldda [%i1 + 0x30] %asi, %i2 /* %i2/%i3 = src1 + 0x30 */ + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + ldda [%l7 + 0x30] %asi, %i4 /* %i4/%i5 = src2 + 0x30 */ + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + ldda [%l6 + 0x30] %asi, %g2 /* %g2/%g3 = src3 + 0x30 */ + xor %l2, %l0, %l2 + xor %l3, %l1, %l3 + stxa %l2, [%i0 + 0x20] %asi + stxa %l3, [%i0 + 0x28] %asi + ldda [%l5 + 0x30] %asi, %l0 /* %l0/%l1 = src4 + 0x30 */ + ldda [%i0 + 0x30] %asi, %l2 /* %l2/%l3 = dest + 0x30 */ + + prefetch [%i1 + 0x40], #one_read + prefetch [%l7 + 0x40], #one_read + prefetch [%l6 + 0x40], #one_read + prefetch [%l5 + 0x40], #one_read + prefetch [%i0 + 0x40], #n_writes + + xor %i4, %i2, %i4 + xor %i5, %i3, %i5 + xor %g2, %i4, %g2 + xor %g3, %i5, %g3 + xor %l0, %g2, %l0 + xor %l1, %g3, %l1 + xor %l2, %l0, %l2 + xor %l3, %l1, %l3 + stxa %l2, [%i0 + 0x30] %asi + stxa %l3, [%i0 + 0x38] %asi + + add %i0, 0x40, %i0 + add %i1, 0x40, %i1 + add %l7, 0x40, %l7 + add %l6, 0x40, %l6 + subcc %g1, 1, %g1 + bne,pt %xcc, 1b + add %l5, 0x40, %l5 + membar #Sync + wr %g7, 0x0, %asi + ret + restore +ENDPROC(xor_niagara_5) |
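
The VIS and Niagara routines added above all compute the same thing: an in-place XOR of the destination block with one to four source blocks, 64 bytes per loop iteration, under the requirements stated in the xor.S header (64-byte-aligned buffers, byte count a multiple of 128 and at least 256). For reference, a minimal C sketch of that operation is given here; it is an illustration only, not the kernel's generic fallback, and the names xor_block_2/xor_block_3 are made up for this sketch.

	#include <stddef.h>
	#include <stdint.h>

	/* Illustration only: the effect of xor_vis_2/xor_niagara_2 (dest ^= src)
	 * and xor_vis_3/xor_niagara_3 (dest ^= src1 ^ src2).  The assembly assumes
	 * 64-byte-aligned buffers and a byte count that is a multiple of 128 and
	 * at least 256, then works through eight 8-byte words per iteration.
	 */
	static void xor_block_2(size_t bytes, uint64_t *dest, const uint64_t *src)
	{
		size_t i, words = bytes / sizeof(uint64_t);

		for (i = 0; i < words; i++)
			dest[i] ^= src[i];
	}

	static void xor_block_3(size_t bytes, uint64_t *dest,
				const uint64_t *src1, const uint64_t *src2)
	{
		size_t i, words = bytes / sizeof(uint64_t);

		for (i = 0; i < words; i++)
			dest[i] ^= src1[i] ^ src2[i];
	}

The assembly versions unroll this into 64-byte chunks so they can use block loads/stores (VIS) or store-init/twin-load ASIs (Niagara) and overlap loads of the next chunk with the XORs of the current one.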

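The comment block in the removed umul.S tail earlier in this diff explains the unsigned-multiply correction: the mulscc shift-and-add sequence effectively treats one operand as signed, so when the other operand's top bit is set the raw result comes out short by y << 32, and the "faster code" path adds that term back branch-free via a sign mask. The small C model below checks that reading of the comment; it is an illustration under that assumption, not kernel code, and umul32_fixup is a made-up name.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Model of the sign compensation: a signed shift-and-add multiply
	 * effectively computes y * (int32_t)x, i.e. the true unsigned product
	 * minus (y << 32) whenever x's top bit is set; add that term back.
	 */
	static uint64_t umul32_fixup(uint32_t x, uint32_t y)
	{
		uint64_t raw = (uint64_t)((int64_t)(int32_t)x * y); /* what mulscc yields */
		uint32_t mask = (uint32_t)((int32_t)x >> 31);       /* all ones iff x's top bit set */

		return raw + ((uint64_t)(y & mask) << 32);          /* add y << 32 back */
	}

	int main(void)
	{
		assert(umul32_fixup(0x80000001u, 3u) == (uint64_t)0x80000001u * 3u);
		assert(umul32_fixup(0xffffffffu, 0xffffffffu) ==
		       (uint64_t)0xffffffffu * 0xffffffffu);
		printf("umul fixup model ok\n");
		return 0;
	}

This mirrors the deleted fast path (sra to build the mask, and to select the compensation, addcc to fold it into the upper half) rather than the branching #if 0 variant.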