diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ia64/lib |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r-- | arch/ia64/lib/Makefile | 52 | ||||
-rw-r--r-- | arch/ia64/lib/bitop.c | 88 | ||||
-rw-r--r-- | arch/ia64/lib/carta_random.S | 54 | ||||
-rw-r--r-- | arch/ia64/lib/checksum.c | 102 | ||||
-rw-r--r-- | arch/ia64/lib/clear_page.S | 77 | ||||
-rw-r--r-- | arch/ia64/lib/clear_user.S | 209 | ||||
-rw-r--r-- | arch/ia64/lib/copy_page.S | 98 | ||||
-rw-r--r-- | arch/ia64/lib/copy_page_mck.S | 185 | ||||
-rw-r--r-- | arch/ia64/lib/copy_user.S | 610 | ||||
-rw-r--r-- | arch/ia64/lib/csum_partial_copy.c | 151 | ||||
-rw-r--r-- | arch/ia64/lib/dec_and_lock.c | 42 | ||||
-rw-r--r-- | arch/ia64/lib/do_csum.S | 323 | ||||
-rw-r--r-- | arch/ia64/lib/flush.S | 39 | ||||
-rw-r--r-- | arch/ia64/lib/idiv32.S | 83 | ||||
-rw-r--r-- | arch/ia64/lib/idiv64.S | 80 | ||||
-rw-r--r-- | arch/ia64/lib/io.c | 165 | ||||
-rw-r--r-- | arch/ia64/lib/ip_fast_csum.S | 90 | ||||
-rw-r--r-- | arch/ia64/lib/memcpy.S | 301 | ||||
-rw-r--r-- | arch/ia64/lib/memcpy_mck.S | 661 | ||||
-rw-r--r-- | arch/ia64/lib/memset.S | 362 | ||||
-rw-r--r-- | arch/ia64/lib/strlen.S | 192 | ||||
-rw-r--r-- | arch/ia64/lib/strlen_user.S | 198 | ||||
-rw-r--r-- | arch/ia64/lib/strncpy_from_user.S | 44 | ||||
-rw-r--r-- | arch/ia64/lib/strnlen_user.S | 45 | ||||
-rw-r--r-- | arch/ia64/lib/swiotlb.c | 658 | ||||
-rw-r--r-- | arch/ia64/lib/xor.S | 184 |
26 files changed, 5093 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile new file mode 100644 index 00000000000..1902c3c2ef9 --- /dev/null +++ b/arch/ia64/lib/Makefile @@ -0,0 +1,52 @@ +# +# Makefile for ia64-specific library routines.. +# + +obj-y := io.o + +lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ + __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ + bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ + clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ + flush.o ip_fast_csum.o do_csum.o \ + memset.o strlen.o swiotlb.o + +lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o +lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o +lib-$(CONFIG_PERFMON) += carta_random.o +lib-$(CONFIG_MD_RAID5) += xor.o +lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o + +AFLAGS___divdi3.o = +AFLAGS___udivdi3.o = -DUNSIGNED +AFLAGS___moddi3.o = -DMODULO +AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO + +AFLAGS___divsi3.o = +AFLAGS___udivsi3.o = -DUNSIGNED +AFLAGS___modsi3.o = -DMODULO +AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO + +$(obj)/__divdi3.o: $(src)/idiv64.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__moddi3.o: $(src)/idiv64.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__divsi3.o: $(src)/idiv32.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__modsi3.o: $(src)/idiv32.S FORCE + $(call if_changed_dep,as_o_S) + +$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE + $(call if_changed_dep,as_o_S) diff --git a/arch/ia64/lib/bitop.c b/arch/ia64/lib/bitop.c new file mode 100644 index 00000000000..82e299c8464 --- /dev/null +++ b/arch/ia64/lib/bitop.c @@ -0,0 +1,88 @@ +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/intrinsics.h> +#include <linux/module.h> +#include <linux/bitops.h> + +/* + * Find next zero bit in a bitmap reasonably efficiently.. + */ + +int __find_next_zero_bit (const void *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp |= ~0UL >> (64-offset); + if (size < 64) + goto found_first; + if (~tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if (~(tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* any bits zero? */ + return result + size; /* nope */ +found_middle: + return result + ffz(tmp); +} +EXPORT_SYMBOL(__find_next_zero_bit); + +/* + * Find next bit in a bitmap reasonably efficiently.. + */ +int __find_next_bit(const void *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (64-size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ + found_middle: + return result + __ffs(tmp); +} +EXPORT_SYMBOL(__find_next_bit); diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S new file mode 100644 index 00000000000..d0674c36036 --- /dev/null +++ b/arch/ia64/lib/carta_random.S @@ -0,0 +1,54 @@ +/* + * Fast, simple, yet decent quality random number generator based on + * a paper by David G. Carta ("Two Fast Implementations of the + * `Minimal Standard' Random Number Generator," Communications of the + * ACM, January, 1990). + * + * Copyright (C) 2002 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#include <asm/asmmacro.h> + +#define a r2 +#define m r3 +#define lo r8 +#define hi r9 +#define t0 r16 +#define t1 r17 +#define seed r32 + +GLOBAL_ENTRY(carta_random32) + movl a = (16807 << 16) | 16807 + ;; + pmpyshr2.u t0 = a, seed, 0 + pmpyshr2.u t1 = a, seed, 16 + ;; + unpack2.l t0 = t1, t0 + dep m = -1, r0, 0, 31 + ;; + zxt4 lo = t0 + shr.u hi = t0, 32 + ;; + dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff) + ;; + shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16 + shr t1 = hi, 15 // t1 = (hi >> 15) + ;; + add lo = lo, t0 + ;; + cmp.gtu p6, p0 = lo, m + ;; +(p6) and lo = lo, m + ;; +(p6) add lo = 1, lo + ;; + add lo = lo, t1 + ;; + cmp.gtu p6, p0 = lo, m + ;; +(p6) and lo = lo, m + ;; +(p6) add lo = 1, lo + br.ret.sptk.many rp +END(carta_random32) diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c new file mode 100644 index 00000000000..beb11721d9f --- /dev/null +++ b/arch/ia64/lib/checksum.c @@ -0,0 +1,102 @@ +/* + * Network checksum routines + * + * Copyright (C) 1999, 2003 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * + * Most of the code coming from arch/alpha/lib/checksum.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed.. + */ + +#include <linux/module.h> +#include <linux/string.h> + +#include <asm/byteorder.h> + +static inline unsigned short +from64to16 (unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented. + */ +unsigned short int +csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len, + unsigned short proto, unsigned int sum) +{ + return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); +} + +EXPORT_SYMBOL(csum_tcpudp_magic); + +unsigned int +csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, + unsigned short proto, unsigned int sum) +{ + unsigned long result; + + result = (saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); + + /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ + /* 64 to 33 */ + result = (result & 0xffffffff) + (result >> 32); + /* 33 to 32 */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +extern unsigned long do_csum (const unsigned char *, long); + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +unsigned int +csum_partial (const unsigned char * buff, int len, unsigned int sum) +{ + unsigned long result = do_csum(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +unsigned short +ip_compute_csum (unsigned char * buff, int len) +{ + return ~do_csum(buff,len); +} + +EXPORT_SYMBOL(ip_compute_csum); diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S new file mode 100644 index 00000000000..d4987061dda --- /dev/null +++ b/arch/ia64/lib/clear_page.S @@ -0,0 +1,77 @@ +/* + * Copyright (C) 1999-2002 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> + * + * 1/06/01 davidm Tuned for Itanium. + * 2/12/02 kchen Tuned for both Itanium and McKinley + * 3/08/02 davidm Some more tweaking + */ +#include <linux/config.h> + +#include <asm/asmmacro.h> +#include <asm/page.h> + +#ifdef CONFIG_ITANIUM +# define L3_LINE_SIZE 64 // Itanium L3 line size +# define PREFETCH_LINES 9 // magic number +#else +# define L3_LINE_SIZE 128 // McKinley L3 line size +# define PREFETCH_LINES 12 // magic number +#endif + +#define saved_lc r2 +#define dst_fetch r3 +#define dst1 r8 +#define dst2 r9 +#define dst3 r10 +#define dst4 r11 + +#define dst_last r31 + +GLOBAL_ENTRY(clear_page) + .prologue + .regstk 1,0,0,0 + mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until + .save ar.lc, saved_lc + mov saved_lc = ar.lc + + .body + mov ar.lc = (PREFETCH_LINES - 1) + mov dst_fetch = in0 + adds dst1 = 16, in0 + adds dst2 = 32, in0 + ;; +.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE + adds dst3 = 48, in0 // executing this multiple times is harmless + br.cloop.sptk.few .fetch + ;; + addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch + mov ar.lc = r16 // one L3 line per iteration + adds dst4 = 64, in0 + ;; +#ifdef CONFIG_ITANIUM + // Optimized for Itanium +1: stf.spill.nta [dst1] = f0, 64 + stf.spill.nta [dst2] = f0, 64 + cmp.lt p8,p0=dst_fetch, dst_last + ;; +#else + // Optimized for McKinley +1: stf.spill.nta [dst1] = f0, 64 + stf.spill.nta [dst2] = f0, 64 + stf.spill.nta [dst3] = f0, 64 + stf.spill.nta [dst4] = f0, 128 + cmp.lt p8,p0=dst_fetch, dst_last + ;; + stf.spill.nta [dst1] = f0, 64 + stf.spill.nta [dst2] = f0, 64 +#endif + stf.spill.nta [dst3] = f0, 64 +(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE + br.cloop.sptk.few 1b + ;; + mov ar.lc = saved_lc // restore lc + br.ret.sptk.many rp +END(clear_page) diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S new file mode 100644 index 00000000000..eecd8577b20 --- /dev/null +++ b/arch/ia64/lib/clear_user.S @@ -0,0 +1,209 @@ +/* + * This routine clears to zero a linear memory buffer in user space. + * + * Inputs: + * in0: address of buffer + * in1: length of buffer in bytes + * Outputs: + * r8: number of bytes that didn't get cleared due to a fault + * + * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + */ + +#include <asm/asmmacro.h> + +// +// arguments +// +#define buf r32 +#define len r33 + +// +// local registers +// +#define cnt r16 +#define buf2 r17 +#define saved_lc r18 +#define saved_pfs r19 +#define tmp r20 +#define len2 r21 +#define len3 r22 + +// +// Theory of operations: +// - we check whether or not the buffer is small, i.e., less than 17 +// in which case we do the byte by byte loop. +// +// - Otherwise we go progressively from 1 byte store to 8byte store in +// the head part, the body is a 16byte store loop and we finish we the +// tail for the last 15 bytes. +// The good point about this breakdown is that the long buffer handling +// contains only 2 branches. +// +// The reason for not using shifting & masking for both the head and the +// tail is to stay semantically correct. This routine is not supposed +// to write bytes outside of the buffer. While most of the time this would +// be ok, we can't tolerate a mistake. A classical example is the case +// of multithreaded code were to the extra bytes touched is actually owned +// by another thread which runs concurrently to ours. Another, less likely, +// example is with device drivers where reading an I/O mapped location may +// have side effects (same thing for writing). +// + +GLOBAL_ENTRY(__do_clear_user) + .prologue + .save ar.pfs, saved_pfs + alloc saved_pfs=ar.pfs,2,0,0,0 + cmp.eq p6,p0=r0,len // check for zero length + .save ar.lc, saved_lc + mov saved_lc=ar.lc // preserve ar.lc (slow) + .body + ;; // avoid WAW on CFM + adds tmp=-1,len // br.ctop is repeat/until + mov ret0=len // return value is length at this point +(p6) br.ret.spnt.many rp + ;; + cmp.lt p6,p0=16,len // if len > 16 then long memset + mov ar.lc=tmp // initialize lc for small count +(p6) br.cond.dptk .long_do_clear + ;; // WAR on ar.lc + // + // worst case 16 iterations, avg 8 iterations + // + // We could have played with the predicates to use the extra + // M slot for 2 stores/iteration but the cost the initialization + // the various counters compared to how long the loop is supposed + // to last on average does not make this solution viable. + // +1: + EX( .Lexit1, st1 [buf]=r0,1 ) + adds len=-1,len // countdown length using len + br.cloop.dptk 1b + ;; // avoid RAW on ar.lc + // + // .Lexit4: comes from byte by byte loop + // len contains bytes left +.Lexit1: + mov ret0=len // faster than using ar.lc + mov ar.lc=saved_lc + br.ret.sptk.many rp // end of short clear_user + + + // + // At this point we know we have more than 16 bytes to copy + // so we focus on alignment (no branches required) + // + // The use of len/len2 for countdown of the number of bytes left + // instead of ret0 is due to the fact that the exception code + // changes the values of r8. + // +.long_do_clear: + tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) + ;; + EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned +(p6) adds len=-1,len;; // sync because buf is modified + tbit.nz p6,p0=buf,1 + ;; + EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned +(p6) adds len=-2,len;; + tbit.nz p6,p0=buf,2 + ;; + EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned +(p6) adds len=-4,len;; + tbit.nz p6,p0=buf,3 + ;; + EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned +(p6) adds len=-8,len;; + shr.u cnt=len,4 // number of 128-bit (2x64bit) words + ;; + cmp.eq p6,p0=r0,cnt + adds tmp=-1,cnt +(p6) br.cond.dpnt .dotail // we have less than 16 bytes left + ;; + adds buf2=8,buf // setup second base pointer + mov ar.lc=tmp + ;; + + // + // 16bytes/iteration core loop + // + // The second store can never generate a fault because + // we come into the loop only when we are 16-byte aligned. + // This means that if we cross a page then it will always be + // in the first store and never in the second. + // + // + // We need to keep track of the remaining length. A possible (optimistic) + // way would be to use ar.lc and derive how many byte were left by + // doing : left= 16*ar.lc + 16. this would avoid the addition at + // every iteration. + // However we need to keep the synchronization point. A template + // M;;MB does not exist and thus we can keep the addition at no + // extra cycle cost (use a nop slot anyway). It also simplifies the + // (unlikely) error recovery code + // + +2: EX(.Lexit3, st8 [buf]=r0,16 ) + ;; // needed to get len correct when error + st8 [buf2]=r0,16 + adds len=-16,len + br.cloop.dptk 2b + ;; + mov ar.lc=saved_lc + // + // tail correction based on len only + // + // We alternate the use of len3,len2 to allow parallelism and correct + // error handling. We also reuse p6/p7 to return correct value. + // The addition of len2/len3 does not cost anything more compared to + // the regular memset as we had empty slots. + // +.dotail: + mov len2=len // for parallelization of error handling + mov len3=len + tbit.nz p6,p0=len,3 + ;; + EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes +(p6) adds len3=-8,len2 + tbit.nz p7,p6=len,2 + ;; + EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes +(p7) adds len2=-4,len3 + tbit.nz p6,p7=len,1 + ;; + EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes +(p6) adds len3=-2,len2 + tbit.nz p7,p6=len,0 + ;; + EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left + mov ret0=r0 // success + br.ret.sptk.many rp // end of most likely path + + // + // Outlined error handling code + // + + // + // .Lexit3: comes from core loop, need restore pr/lc + // len contains bytes left + // + // + // .Lexit2: + // if p6 -> coming from st8 or st2 : len2 contains what's left + // if p7 -> coming from st4 or st1 : len3 contains what's left + // We must restore lc/pr even though might not have been used. +.Lexit2: + .pred.rel "mutex", p6, p7 +(p6) mov len=len2 +(p7) mov len=len3 + ;; + // + // .Lexit4: comes from head, need not restore pr/lc + // len contains bytes left + // +.Lexit3: + mov ret0=len + mov ar.lc=saved_lc + br.ret.sptk.many rp +END(__do_clear_user) diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S new file mode 100644 index 00000000000..127d1d050d7 --- /dev/null +++ b/arch/ia64/lib/copy_page.S @@ -0,0 +1,98 @@ +/* + * + * Optimized version of the standard copy_page() function + * + * Inputs: + * in0: address of target page + * in1: address of source page + * Output: + * no return value + * + * Copyright (C) 1999, 2001 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger <davidm@hpl.hp.com> + * + * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies. + */ +#include <asm/asmmacro.h> +#include <asm/page.h> + +#define PIPE_DEPTH 3 +#define EPI p[PIPE_DEPTH-1] + +#define lcount r16 +#define saved_pr r17 +#define saved_lc r18 +#define saved_pfs r19 +#define src1 r20 +#define src2 r21 +#define tgt1 r22 +#define tgt2 r23 +#define srcf r24 +#define tgtf r25 +#define tgt_last r26 + +#define Nrot ((8*PIPE_DEPTH+7)&~7) + +GLOBAL_ENTRY(copy_page) + .prologue + .save ar.pfs, saved_pfs + alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot + + .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \ + t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH] + .rotp p[PIPE_DEPTH] + + .save ar.lc, saved_lc + mov saved_lc=ar.lc + mov ar.ec=PIPE_DEPTH + + mov lcount=PAGE_SIZE/64-1 + .save pr, saved_pr + mov saved_pr=pr + mov pr.rot=1<<16 + + .body + + mov src1=in1 + adds src2=8,in1 + mov tgt_last = PAGE_SIZE + ;; + adds tgt2=8,in0 + add srcf=512,in1 + mov ar.lc=lcount + mov tgt1=in0 + add tgtf=512,in0 + add tgt_last = tgt_last, in0 + ;; +1: +(p[0]) ld8 t1[0]=[src1],16 +(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16 +(p[0]) ld8 t2[0]=[src2],16 +(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16 + cmp.ltu p6,p0 = tgtf, tgt_last + ;; +(p[0]) ld8 t3[0]=[src1],16 +(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16 +(p[0]) ld8 t4[0]=[src2],16 +(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16 + ;; +(p[0]) ld8 t5[0]=[src1],16 +(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16 +(p[0]) ld8 t6[0]=[src2],16 +(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16 + ;; +(p[0]) ld8 t7[0]=[src1],16 +(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16 +(p[0]) ld8 t8[0]=[src2],16 +(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16 + +(p6) lfetch [srcf], 64 +(p6) lfetch [tgtf], 64 + br.ctop.sptk.few 1b + ;; + mov pr=saved_pr,0xffffffffffff0000 // restore predicates + mov ar.pfs=saved_pfs + mov ar.lc=saved_lc + br.ret.sptk.many rp +END(copy_page) diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S new file mode 100644 index 00000000000..3c45d60a81b --- /dev/null +++ b/arch/ia64/lib/copy_page_mck.S @@ -0,0 +1,185 @@ +/* + * McKinley-optimized version of copy_page(). + * + * Copyright (C) 2002 Hewlett-Packard Co + * David Mosberger <davidm@hpl.hp.com> + * + * Inputs: + * in0: address of target page + * in1: address of source page + * Output: + * no return value + * + * General idea: + * - use regular loads and stores to prefetch data to avoid consuming M-slot just for + * lfetches => good for in-cache performance + * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single + * cycle + * + * Principle of operation: + * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes. + * To avoid secondary misses in L2, we prefetch both source and destination with a line-size + * of 128 bytes. When both of these lines are in the L2 and the first half of the + * source line is in L1, we start copying the remaining words. The second half of the + * source line is prefetched in an earlier iteration, so that by the time we start + * accessing it, it's also present in the L1. + * + * We use a software-pipelined loop to control the overall operation. The pipeline + * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching + * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination + * cache-lines, the last K stages are used to copy the cache-line words not copied by + * the prefetches. The four relevant points in the pipelined are called A, B, C, D: + * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line + * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought + * into L1D and p[D] is TRUE if a cacheline needs to be copied. + * + * This all sounds very complicated, but thanks to the modulo-scheduled loop support, + * the resulting code is very regular and quite easy to follow (once you get the idea). + * + * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented + * as the separate .prefetch_loop. Logically, this loop performs exactly like the + * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed, + * so that each loop iteration is faster (again, good for cached case). + * + * When reading the code, it helps to keep the following picture in mind: + * + * word 0 word 1 + * +------+------+--- + * | v[x] | t1 | ^ + * | t2 | t3 | | + * | t4 | t5 | | + * | t6 | t7 | | 128 bytes + * | n[y] | t9 | | (L2 cache line) + * | t10 | t11 | | + * | t12 | t13 | | + * | t14 | t15 | v + * +------+------+--- + * + * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C] + * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in + * an order that avoids bank conflicts. + */ +#include <asm/asmmacro.h> +#include <asm/page.h> + +#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st) + +#define src0 r2 +#define src1 r3 +#define dst0 r9 +#define dst1 r10 +#define src_pre_mem r11 +#define dst_pre_mem r14 +#define src_pre_l2 r15 +#define dst_pre_l2 r16 +#define t1 r17 +#define t2 r18 +#define t3 r19 +#define t4 r20 +#define t5 t1 // alias! +#define t6 t2 // alias! +#define t7 t3 // alias! +#define t9 t5 // alias! +#define t10 t4 // alias! +#define t11 t7 // alias! +#define t12 t6 // alias! +#define t14 t10 // alias! +#define t13 r21 +#define t15 r22 + +#define saved_lc r23 +#define saved_pr r24 + +#define A 0 +#define B (PREFETCH_DIST) +#define C (B + PREFETCH_DIST) +#define D (C + 3) +#define N (D + 1) +#define Nrot ((N + 7) & ~7) + +GLOBAL_ENTRY(copy_page) + .prologue + alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot + + .rotr v[2*PREFETCH_DIST], n[D-C+1] + .rotp p[N] + + .save ar.lc, saved_lc + mov saved_lc = ar.lc + .save pr, saved_pr + mov saved_pr = pr + .body + + mov src_pre_mem = in1 + mov pr.rot = 0x10000 + mov ar.ec = 1 // special unrolled loop + + mov dst_pre_mem = in0 + mov ar.lc = 2*PREFETCH_DIST - 1 + + add src_pre_l2 = 8*8, in1 + add dst_pre_l2 = 8*8, in0 + add src0 = 8, in1 // first t1 src + add src1 = 3*8, in1 // first t3 src + add dst0 = 8, in0 // first t1 dst + add dst1 = 3*8, in0 // first t3 dst + mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1 + nop.m 0 + nop.i 0 + ;; + // same as .line_copy loop, but with all predicated-off instructions removed: +.prefetch_loop: +(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 +(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 + br.ctop.sptk .prefetch_loop + ;; + cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero) + mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits! + mov ar.ec = N // # of stages in pipeline + ;; +.line_copy: +(p[D]) ld8 t2 = [src0], 3*8 // M0 +(p[D]) ld8 t4 = [src1], 3*8 // M1 +(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory +(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2 + ;; +(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory +(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2 +(p[D]) st8 [dst0] = t1, 8 // M2 +(p[D]) st8 [dst1] = t3, 8 // M3 + ;; +(p[D]) ld8 t5 = [src0], 8 +(p[D]) ld8 t7 = [src1], 3*8 +(p[D]) st8 [dst0] = t2, 3*8 +(p[D]) st8 [dst1] = t4, 3*8 + ;; +(p[D]) ld8 t6 = [src0], 3*8 +(p[D]) ld8 t10 = [src1], 8 +(p[D]) st8 [dst0] = t5, 8 +(p[D]) st8 [dst1] = t7, 3*8 + ;; +(p[D]) ld8 t9 = [src0], 3*8 +(p[D]) ld8 t11 = [src1], 3*8 +(p[D]) st8 [dst0] = t6, 3*8 +(p[D]) st8 [dst1] = t10, 8 + ;; +(p[D]) ld8 t12 = [src0], 8 +(p[D]) ld8 t14 = [src1], 8 +(p[D]) st8 [dst0] = t9, 3*8 +(p[D]) st8 [dst1] = t11, 3*8 + ;; +(p[D]) ld8 t13 = [src0], 4*8 +(p[D]) ld8 t15 = [src1], 4*8 +(p[D]) st8 [dst0] = t12, 8 +(p[D]) st8 [dst1] = t14, 8 + ;; +(p[D-1])ld8 t1 = [src0], 8 +(p[D-1])ld8 t3 = [src1], 8 +(p[D]) st8 [dst0] = t13, 4*8 +(p[D]) st8 [dst1] = t15, 4*8 + br.ctop.sptk .line_copy + ;; + mov ar.lc = saved_lc + mov pr = saved_pr, -1 + br.ret.sptk.many rp +END(copy_page) diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S new file mode 100644 index 00000000000..c952bdc6a09 --- /dev/null +++ b/arch/ia64/lib/copy_user.S @@ -0,0 +1,610 @@ +/* + * + * Optimized version of the copy_user() routine. + * It is used to copy date across the kernel/user boundary. + * + * The source and destination are always on opposite side of + * the boundary. When reading from user space we must catch + * faults on loads. When writing to user space we must catch + * errors on stores. Note that because of the nature of the copy + * we don't need to worry about overlapping regions. + * + * + * Inputs: + * in0 address of source buffer + * in1 address of destination buffer + * in2 number of bytes to copy + * + * Outputs: + * ret0 0 in case of success. The number of bytes NOT copied in + * case of error. + * + * Copyright (C) 2000-2001 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * + * Fixme: + * - handle the case where we have more than 16 bytes and the alignment + * are different. + * - more benchmarking + * - fix extraneous stop bit introduced by the EX() macro. + */ + +#include <asm/asmmacro.h> + +// +// Tuneable parameters +// +#define COPY_BREAK 16 // we do byte copy below (must be >=16) +#define PIPE_DEPTH 21 // pipe depth + +#define EPI p[PIPE_DEPTH |