diff options
Diffstat (limited to 'arch/alpha/lib')
26 files changed, 200 insertions, 1108 deletions
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile index 21cf624d732..59660743237 100644 --- a/arch/alpha/lib/Makefile +++ b/arch/alpha/lib/Makefile @@ -2,8 +2,8 @@ # Makefile for alpha-specific library files.. # -EXTRA_AFLAGS := $(CFLAGS) -EXTRA_CFLAGS := -Werror +asflags-y := $(KBUILD_CFLAGS) +ccflags-y := -Werror # Many of these routines have implementations tuned for ev6. # Choose them iff we're targeting ev6 specifically. @@ -31,14 +31,12 @@ lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \ $(ev6-y)memchr.o \ $(ev6-y)copy_user.o \ $(ev6-y)clear_user.o \ - $(ev6-y)strncpy_from_user.o \ - $(ev67-y)strlen_user.o \ $(ev6-y)csum_ipv6_magic.o \ $(ev6-y)clear_page.o \ $(ev6-y)copy_page.o \ - strcasecmp.o \ fpreg.o \ - callback_srm.o srm_puts.o srm_printk.o + callback_srm.o srm_puts.o srm_printk.o \ + fls.o lib-$(CONFIG_SMP) += dec_and_lock.o diff --git a/arch/alpha/lib/callback_srm.S b/arch/alpha/lib/callback_srm.S index 0528acd0d9a..8804bec2c64 100644 --- a/arch/alpha/lib/callback_srm.S +++ b/arch/alpha/lib/callback_srm.S @@ -2,7 +2,6 @@ * arch/alpha/lib/callback_srm.S */ -#include <linux/config.h> #include <asm/console.h> .text diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c index 89044e6385f..199f6efa83f 100644 --- a/arch/alpha/lib/checksum.c +++ b/arch/alpha/lib/checksum.c @@ -5,7 +5,7 @@ * in an architecture-specific manner due to speed.. * Comments in other versions indicate that the algorithms are from RFC1071 * - * accellerated versions (and 21264 assembly versions ) contributed by + * accelerated versions (and 21264 assembly versions ) contributed by * Rick Gorton <rick.gorton@alpha-processor.com> */ @@ -41,28 +41,25 @@ static inline unsigned short from64to16(unsigned long x) * computes the checksum of the TCP/UDP pseudo-header * returns a 16-bit checksum, already complemented. */ -unsigned short int csum_tcpudp_magic(unsigned long saddr, - unsigned long daddr, +__sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len, unsigned short proto, - unsigned int sum) + __wsum sum) { - return ~from64to16(saddr + daddr + sum + - ((unsigned long) ntohs(len) << 16) + - ((unsigned long) proto << 8)); + return (__force __sum16)~from64to16( + (__force u64)saddr + (__force u64)daddr + + (__force u64)sum + ((len + proto) << 8)); } -unsigned int csum_tcpudp_nofold(unsigned long saddr, - unsigned long daddr, +__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, unsigned short proto, - unsigned int sum) + __wsum sum) { unsigned long result; - result = (saddr + daddr + sum + - ((unsigned long) ntohs(len) << 16) + - ((unsigned long) proto << 8)); + result = (__force u64)saddr + (__force u64)daddr + + (__force u64)sum + ((len + proto) << 8); /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ @@ -70,8 +67,9 @@ unsigned int csum_tcpudp_nofold(unsigned long saddr, result = (result & 0xffffffff) + (result >> 32); /* 33 to 32 */ result = (result & 0xffffffff) + (result >> 32); - return result; + return (__force __wsum)result; } +EXPORT_SYMBOL(csum_tcpudp_nofold); /* * Do a 64-bit checksum on an arbitrary memory area.. @@ -146,9 +144,9 @@ out: * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ -unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) +__sum16 ip_fast_csum(const void *iph, unsigned int ihl) { - return ~do_csum(iph,ihl*4); + return (__force __sum16)~do_csum(iph,ihl*4); } /* @@ -163,15 +161,15 @@ unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) * * it's best to have buff aligned on a 32-bit boundary */ -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) +__wsum csum_partial(const void *buff, int len, __wsum sum) { unsigned long result = do_csum(buff, len); /* add in old sum, and carry.. */ - result += sum; + result += (__force u32)sum; /* 32+c bits -> 32 bits */ result = (result & 0xffffffff) + (result >> 32); - return result; + return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); @@ -180,7 +178,7 @@ EXPORT_SYMBOL(csum_partial); * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ -unsigned short ip_compute_csum(unsigned char * buff, int len) +__sum16 ip_compute_csum(const void *buff, int len) { - return ~from64to16(do_csum(buff,len)); + return (__force __sum16)~from64to16(do_csum(buff,len)); } diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S index e09748dbf2e..2c2acb96deb 100644 --- a/arch/alpha/lib/csum_ipv6_magic.S +++ b/arch/alpha/lib/csum_ipv6_magic.S @@ -7,6 +7,9 @@ * __u32 len, * unsigned short proto, * unsigned int csum); + * + * Misalignment handling (which costs 16 instructions / 8 cycles) + * added by Ivan Kokshaysky <ink@jurassic.park.msu.ru> */ .globl csum_ipv6_magic @@ -16,37 +19,57 @@ csum_ipv6_magic: .prologue 0 - ldq $0,0($16) # e0 : load src & dst addr words + ldq_u $0,0($16) # e0 : load src & dst addr words zapnot $20,15,$20 # .. e1 : zero extend incoming csum extqh $18,1,$4 # e0 : byte swap len & proto while we wait - ldq $1,8($16) # .. e1 : + ldq_u $21,7($16) # .. e1 : handle misalignment extbl $18,1,$5 # e0 : - ldq $2,0($17) # .. e1 : + ldq_u $1,8($16) # .. e1 : extbl $18,2,$6 # e0 : - ldq $3,8($17) # .. e1 : + ldq_u $22,15($16) # .. e1 : extbl $18,3,$18 # e0 : + ldq_u $2,0($17) # .. e1 : sra $4,32,$4 # e0 : + ldq_u $23,7($17) # .. e1 : + + extql $0,$16,$0 # e0 : + ldq_u $3,8($17) # .. e1 : + extqh $21,$16,$21 # e0 : + ldq_u $24,15($17) # .. e1 : + sll $5,16,$5 # e0 : + or $0,$21,$0 # .. e1 : 1st src word complete + extql $1,$16,$1 # e0 : addq $20,$0,$20 # .. e1 : begin summing the words - sll $6,8,$6 # e0 : + extqh $22,$16,$22 # e0 : cmpult $20,$0,$0 # .. e1 : - extwh $19,7,$7 # e0 : - or $4,$18,$18 # .. e1 : + sll $6,8,$6 # e0 : + or $1,$22,$1 # .. e1 : 2nd src word complete - extbl $19,1,$19 # e0 : + extql $2,$17,$2 # e0 : + or $4,$18,$18 # .. e1 : + extqh $23,$17,$23 # e0 : or $5,$6,$5 # .. e1 : - or $18,$5,$18 # e0 : len complete - or $19,$7,$19 # .. e1 : - sll $19,48,$19 # e0 : + extql $3,$17,$3 # e0 : + or $2,$23,$2 # .. e1 : 1st dst word complete + extqh $24,$17,$24 # e0 : + or $18,$5,$18 # .. e1 : len complete + + extwh $19,7,$7 # e0 : + or $3,$24,$3 # .. e1 : 2nd dst word complete + extbl $19,1,$19 # e0 : addq $20,$1,$20 # .. e1 : - sra $19,32,$19 # e0 : proto complete + + or $19,$7,$19 # e0 : cmpult $20,$1,$1 # .. e1 : + sll $19,48,$19 # e0 : + nop # .. e0 : - nop # e0 : + sra $19,32,$19 # e0 : proto complete addq $20,$2,$20 # .. e1 : cmpult $20,$2,$2 # e0 : addq $20,$3,$20 # .. e1 : @@ -84,7 +107,7 @@ csum_ipv6_magic: extwl $0,2,$1 # e0 : fold 17-bit value zapnot $0,3,$0 # .. e1 : addq $0,$1,$0 # e0 : - not $0,$0 # e1 : and complement. + not $0,$0 # .. e1 : and complement. zapnot $0,3,$0 # e0 : ret # .. e1 : diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index a37948f3037..5675dca8dbb 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -2,7 +2,7 @@ * csum_partial_copy - do IP checksumming and copy * * (C) Copyright 1996 Linus Torvalds - * accellerated versions (and 21264 assembly versions ) contributed by + * accelerated versions (and 21264 assembly versions ) contributed by * Rick Gorton <rick.gorton@alpha-processor.com> * * Don't look at this too closely - you'll go mad. The things @@ -130,7 +130,7 @@ csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst, *dst = word | tmp; checksum += carry; } - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -185,7 +185,7 @@ csum_partial_cfu_dest_aligned(const unsigned long __user *src, *dst = word | tmp; checksum += carry; } - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -242,7 +242,7 @@ csum_partial_cfu_src_aligned(const unsigned long __user *src, stq_u(partial_dest | second_dest, dst); out: checksum += carry; - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -325,19 +325,24 @@ csum_partial_cfu_unaligned(const unsigned long __user * src, stq_u(partial_dest | word | second_dest, dst); checksum += carry; } - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } -static unsigned int -do_csum_partial_copy_from_user(const char __user *src, char *dst, int len, - unsigned int sum, int *errp) +__wsum +csum_partial_copy_from_user(const void __user *src, void *dst, int len, + __wsum sum, int *errp) { - unsigned long checksum = (unsigned) sum; + unsigned long checksum = (__force u32) sum; unsigned long soff = 7 & (unsigned long) src; unsigned long doff = 7 & (unsigned long) dst; if (len) { + if (!access_ok(VERIFY_READ, src, len)) { + if (errp) *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } if (!doff) { if (!soff) checksum = csum_partial_cfu_aligned( @@ -367,25 +372,17 @@ do_csum_partial_copy_from_user(const char __user *src, char *dst, int len, } checksum = from64to16 (checksum); } - return checksum; -} - -unsigned int -csum_partial_copy_from_user(const char __user *src, char *dst, int len, - unsigned int sum, int *errp) -{ - if (!access_ok(VERIFY_READ, src, len)) { - *errp = -EFAULT; - memset(dst, 0, len); - return sum; - } - - return do_csum_partial_copy_from_user(src, dst, len, sum, errp); + return (__force __wsum)checksum; } -unsigned int -csum_partial_copy_nocheck(const char __user *src, char *dst, int len, - unsigned int sum) +__wsum +csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) { - return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); + __wsum checksum; + mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); + checksum = csum_partial_copy_from_user((__force const void __user *)src, + dst, len, sum, NULL); + set_fs(oldfs); + return checksum; } diff --git a/arch/alpha/lib/dbg_stackcheck.S b/arch/alpha/lib/dbg_stackcheck.S index 3c1f3e6522e..78f6b924ad8 100644 --- a/arch/alpha/lib/dbg_stackcheck.S +++ b/arch/alpha/lib/dbg_stackcheck.S @@ -1,5 +1,5 @@ /* - * arch/alpha/lib/stackcheck.S + * arch/alpha/lib/dbg_stackcheck.S * Contributed by Richard Henderson (rth@tamu.edu) * * Verify that we have not overflowed the stack. Oops if we have. diff --git a/arch/alpha/lib/dbg_stackkill.S b/arch/alpha/lib/dbg_stackkill.S index e9f6a9dcf2b..c1e40a1a43d 100644 --- a/arch/alpha/lib/dbg_stackkill.S +++ b/arch/alpha/lib/dbg_stackkill.S @@ -1,5 +1,5 @@ /* - * arch/alpha/lib/killstack.S + * arch/alpha/lib/dbg_stackkill.S * Contributed by Richard Henderson (rth@cygnus.com) * * Clobber the balance of the kernel stack, hoping to catch diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c index 6ae2500a9d9..f9f5fe830e9 100644 --- a/arch/alpha/lib/dec_and_lock.c +++ b/arch/alpha/lib/dec_and_lock.c @@ -6,7 +6,7 @@ */ #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> asm (".text \n\ .global _atomic_dec_and_lock \n\ @@ -30,8 +30,7 @@ _atomic_dec_and_lock: \n\ .previous \n\ .end _atomic_dec_and_lock"); -static int __attribute_used__ -atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock) +static int __used atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock) { /* Slow path */ spin_lock(lock); diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S index de1948a6911..fc0bc399f87 100644 --- a/arch/alpha/lib/ev6-csum_ipv6_magic.S +++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S @@ -46,6 +46,10 @@ * add the 3 low ushorts together, generating a uint * a final add of the 2 lower ushorts * truncating the result. + * + * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru> + * The cost is 16 instructions (~8 cycles), including two extra loads which + * may cause additional delay in rare cases (load-load replay traps). */ .globl csum_ipv6_magic @@ -55,25 +59,45 @@ csum_ipv6_magic: .prologue 0 - ldq $0,0($16) # L : Latency: 3 + ldq_u $0,0($16) # L : Latency: 3 inslh $18,7,$4 # U : 0000000000AABBCC - ldq $1,8($16) # L : Latency: 3 + ldq_u $1,8($16) # L : Latency: 3 sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 + and $16,7,$6 # E : src misalignment + ldq_u $5,15($16) # L : Latency: 3 zapnot $20,15,$20 # U : zero extend incoming csum - ldq $2,0($17) # L : Latency: 3 - sll $19,24,$19 # U : U L L U : 0x000000aa bb000000 + ldq_u $2,0($17) # L : U L U L : Latency: 3 + + extql $0,$6,$0 # U : + extqh $1,$6,$22 # U : + ldq_u $3,8($17) # L : Latency: 3 + sll $19,24,$19 # U : U U L U : 0x000000aa bb000000 + + cmoveq $6,$31,$22 # E : src aligned? + ldq_u $23,15($17) # L : Latency: 3 inswl $18,3,$18 # U : 000000CCDD000000 + addl $19,$7,$19 # E : U L U L : <sign bits>bbaabb00 - ldq $3,8($17) # L : Latency: 3 - bis $18,$4,$18 # E : 000000CCDDAABBCC - addl $19,$7,$19 # E : <sign bits>bbaabb00 - nop # E : U L U L + or $0,$22,$0 # E : 1st src word complete + extql $1,$6,$1 # U : + or $18,$4,$18 # E : 000000CCDDAABBCC + extqh $5,$6,$5 # U : L U L U + and $17,7,$6 # E : dst misalignment + extql $2,$6,$2 # U : + or $1,$5,$1 # E : 2nd src word complete + extqh $3,$6,$22 # U : L U L U : + + cmoveq $6,$31,$22 # E : dst aligned? + extql $3,$6,$3 # U : addq $20,$0,$20 # E : begin summing the words + extqh $23,$6,$23 # U : L U L U : + srl $18,16,$4 # U : 0000000000CCDDAA + or $2,$22,$2 # E : 1st dst word complete zap $19,0x3,$19 # U : <sign bits>bbaa0000 - nop # E : L U U L + or $3,$23,$3 # E : U L U L : 2nd dst word complete cmpult $20,$0,$0 # E : addq $20,$1,$20 # E : diff --git a/arch/alpha/lib/ev6-memchr.S b/arch/alpha/lib/ev6-memchr.S index a8e843dbcc2..1a5f71b9d8b 100644 --- a/arch/alpha/lib/ev6-memchr.S +++ b/arch/alpha/lib/ev6-memchr.S @@ -84,7 +84,7 @@ $last_quad: beq $2, $not_found # U : U L U L $found_it: -#if defined(__alpha_fix__) && defined(__alpha_cix__) +#ifdef CONFIG_ALPHA_EV67 /* * Since we are guaranteed to have set one of the bits, we don't * have to worry about coming back with a 0x40 out of cttz... diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S index d8b94e1c7fc..356bb2fdd70 100644 --- a/arch/alpha/lib/ev6-memset.S +++ b/arch/alpha/lib/ev6-memset.S @@ -30,14 +30,15 @@ .set noat .set noreorder .text + .globl memset .globl __memset + .globl ___memset .globl __memsetw .globl __constant_c_memset - .globl memset - .ent __memset + .ent ___memset .align 5 -__memset: +___memset: .frame $30,0,$26,0 .prologue 0 @@ -227,7 +228,7 @@ end_b: nop nop ret $31,($26),1 # L0 : - .end __memset + .end ___memset /* * This is the original body of code, prior to replication and @@ -594,4 +595,5 @@ end_w: .end __memsetw -memset = __memset +memset = ___memset +__memset = ___memset diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S deleted file mode 100644 index d2e28178cac..00000000000 --- a/arch/alpha/lib/ev6-strncpy_from_user.S +++ /dev/null @@ -1,424 +0,0 @@ -/* - * arch/alpha/lib/ev6-strncpy_from_user.S - * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> - * - * Just like strncpy except in the return value: - * - * -EFAULT if an exception occurs before the terminator is copied. - * N if the buffer filled. - * - * Otherwise the length of the string is returned. - * - * Much of the information about 21264 scheduling/coding comes from: - * Compiler Writer's Guide for the Alpha 21264 - * abbreviated as 'CWG' in other comments here - * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html - * Scheduling notation: - * E - either cluster - * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 - * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 - * A bunch of instructions got moved and temp registers were changed - * to aid in scheduling. Control flow was also re-arranged to eliminate - * branches, and to provide longer code sequences to enable better scheduling. - * A total rewrite (using byte load/stores for start & tail sequences) - * is desirable, but very difficult to do without a from-scratch rewrite. - * Save that for the future. - */ - - -#include <asm/errno.h> -#include <asm/regdef.h> - - -/* Allow an exception for an insn; exit if we get one. */ -#define EX(x,y...) \ - 99: x,##y; \ - .section __ex_table,"a"; \ - .long 99b - .; \ - lda $31, $exception-99b($0); \ - .previous - - - .set noat - .set noreorder - .text - - .globl __strncpy_from_user - .ent __strncpy_from_user - .frame $30, 0, $26 - .prologue 0 - - .align 4 -__strncpy_from_user: - and a0, 7, t3 # E : find dest misalignment - beq a2, $zerolength # U : - - /* Are source and destination co-aligned? */ - mov a0, v0 # E : save the string start - xor a0, a1, t4 # E : - EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword - ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword - - addq a2, t3, a2 # E : bias count by dest misalignment - subq a2, 1, a3 # E : - addq zero, 1, t10 # E : - and t4, 7, t4 # E : misalignment between the two - - and a3, 7, t6 # E : number of tail bytes - sll t10, t6, t10 # E : t10 = bitmask of last count byte - bne t4, $unaligned # U : - lda t2, -1 # E : build a mask against false zero - - /* - * We are co-aligned; take care of a partial first word. - * On entry to this basic block: - * t0 == the first destination word for masking back in - * t1 == the first source word. - */ - - srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8 - addq a1, 8, a1 # E : - mskqh t2, a1, t2 # U : detection in the src word - nop - - /* Create the 1st output word and detect 0's in the 1st input word. */ - mskqh t1, a1, t3 # U : - mskql t0, a1, t0 # U : assemble the first output word - ornot t1, t2, t2 # E : - nop - - cmpbge zero, t2, t8 # E : bits set iff null found - or t0, t3, t0 # E : - beq a2, $a_eoc # U : - bne t8, $a_eos # U : 2nd branch in a quad. Bad. - - /* On entry to this basic block: - * t0 == a source quad not containing a null. - * a0 - current aligned destination address - * a1 - current aligned source address - * a2 - count of quadwords to move. - * NOTE: Loop improvement - unrolling this is going to be - * a huge win, since we're going to stall otherwise. - * Fix this later. For _really_ large copies, look - * at using wh64 on a look-ahead basis. See the code - * in clear_user.S and copy_user.S. - * Presumably, since (a0) and (a1) do not overlap (by C definition) - * Lots of nops here: - * - Separate loads from stores - * - Keep it to 1 branch/quadpack so the branch predictor - * can train. - */ -$a_loop: - stq_u t0, 0(a0) # L : - addq a0, 8, a0 # E : - nop - subq a2, 1, a2 # E : - - EX( ldq_u t0, 0(a1) ) # L : - addq a1, 8, a1 # E : - cmpbge zero, t0, t8 # E : Stall 2 cycles on t0 - beq a2, $a_eoc # U : - - beq t8, $a_loop # U : - nop - nop - nop - - /* Take care of the final (partial) word store. At this point - * the end-of-count bit is set in t8 iff it applies. - * - * On entry to this basic block we have: - * t0 == the source word containing the null - * t8 == the cmpbge mask that found it. - */ -$a_eos: - negq t8, t12 # E : find low bit set - and t8, t12, t12 # E : - - /* We're doing a partial word store and so need to combine - our source and original destination words. */ - ldq_u t1, 0(a0) # L : - subq t12, 1, t6 # E : - - or t12, t6, t8 # E : - zapnot t0, t8, t0 # U : clear src bytes > null - zap t1, t8, t1 # U : clear dst bytes <= null - or t0, t1, t0 # E : - - stq_u t0, 0(a0) # L : - br $finish_up # L0 : - nop - nop - - /* Add the end-of-count bit to the eos detection bitmask. */ - .align 4 -$a_eoc: - or t10, t8, t8 - br $a_eos - nop - nop - - -/* The source and destination are not co-aligned. Align the destination - and cope. We have to be very careful about not reading too much and - causing a SEGV. */ - - .align 4 -$u_head: - /* We know just enough now to be able to assemble the first - full source word. We can still find a zero at the end of it - that prevents us from outputting the whole thing. - - On entry to this basic block: - t0 == the first dest word, unmasked - t1 == the shifted low bits of the first source word - t6 == bytemask that is -1 in dest word bytes */ - - EX( ldq_u t2, 8(a1) ) # L : load second src word - addq a1, 8, a1 # E : - mskql t0, a0, t0 # U : mask trailing garbage in dst - extqh t2, a1, t4 # U : - - or t1, t4, t1 # E : first aligned src word complete - mskqh t1, a0, t1 # U : mask leading garbage in src - or t0, t1, t0 # E : first output word complete - or t0, t6, t6 # E : mask original data for zero test - - cmpbge zero, t6, t8 # E : - beq a2, $u_eocfin # U : - bne t8, $u_final # U : bad news - 2nd branch in a quad - lda t6, -1 # E : mask out the bits we have - - mskql t6, a1, t6 # U : already seen - stq_u t0, 0(a0) # L : store first output word - or t6, t2, t2 # E : - cmpbge zero, t2, t8 # E : find nulls in second partial - - addq a0, 8, a0 # E : - subq a2, 1, a2 # E : - bne t8, $u_late_head_exit # U : - nop - - /* Finally, we've got all the stupid leading edge cases taken care - of and we can set up to enter the main loop. */ - - extql t2, a1, t1 # U : position hi-bits of lo word - EX( ldq_u t2, 8(a1) ) # L : read next high-order source word - addq a1, 8, a1 # E : - cmpbge zero, t2, t8 # E : - - beq a2, $u_eoc # U : - bne t8, $u_eos # U : - nop - nop - - /* Unaligned copy main loop. In order to avoid reading too much, - the loop is structured to detect zeros in aligned source words. - This has, unfortunately, effectively pulled half of a loop - iteration out into the head and half into the tail, but it does - prevent nastiness from accumulating in the very thing we want - to run as fast as possible. - - On entry to this basic block: - t1 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word - - We further know that t2 does not contain a null terminator. */ - - /* - * Extra nops here: - * separate load quads from store quads - * only one branch/quad to permit predictor training - */ - - .align 4 -$u_loop: - extqh t2, a1, t0 # U : extract high bits for current word - addq a1, 8, a1 # E : - extql t2, a1, t3 # U : extract low bits for next time - addq a0, 8, a0 # E : - - or t0, t1, t0 # E : current dst word now complete - EX( ldq_u t2, 0(a1) ) # L : load high word for next time - subq a2, 1, a2 # E : - nop - - stq_u t0, -8(a0) # L : save the current word - mov t3, t1 # E : - cmpbge zero, t2, t8 # E : test new word for eos - beq a2, $u_eoc # U : - - beq t8, $u_loop # U : - nop - nop - nop - - /* We've found a zero somewhere in the source word we just read. - If it resides in the lower half, we have one (probably partial) - word to write out, and if it resides in the upper half, we - have one full and one partial word left to write out. - - On entry to this basic block: - t1 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word. */ - .align 4 -$u_eos: - extqh t2, a1, t0 # U : - or t0, t1, t0 # E : first (partial) source word complete - cmpbge zero, t0, t8 # E : is the null in this first bit? - nop - - bne t8, $u_final # U : - stq_u t0, 0(a0) # L : the null was in the high-order bits - addq a0, 8, a0 # E : - subq a2, 1, a2 # E : - - .align 4 -$u_late_head_exit: - extql t2, a1, t0 # U : - cmpbge zero, t0, t8 # E : - or t8, t10, t6 # E : - cmoveq a2, t6, t8 # E : - - /* Take care of a final (probably partial) result word. - On entry to this basic block: - t0 == assembled source word - t8 == cmpbge mask that found the null. */ - .align 4 -$u_final: - negq t8, t6 # E : isolate low bit set - and t6, t8, t12 # E : - ldq_u t1, 0(a0) # L : - subq t12, 1, t6 # E : - - or t6, t12, t8 # E : - zapnot t0, t8, t0 # U : kill source bytes > null - zap t1, t8, t1 # U : kill dest bytes <= null - or t0, t1, t0 # E : - - stq_u t0, 0(a0) # E : - br $finish_up # U : - nop - nop - - .align 4 -$u_eoc: # end-of-count - extqh t2, a1, t0 # U : - or t0, t1, t0 # E : - cmpbge zero, t0, t8 # E : - nop - - .align 4 -$u_eocfin: # end-of-count, final word - or t10, t8, t8 # E : - br $u_final # U : - nop - nop - - /* Unaligned copy entry point. */ - .align 4 -$unaligned: - - srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8 - and a0, 7, t4 # E : find dest misalignment - and a1, 7, t5 # E : find src misalignment - mov zero, t0 # E : - - /* Conditionally load the first destination word and a bytemask - with 0xff indicating that the destination byte is sacrosanct. */ - - mov zero, t6 # E : - beq t4, 1f # U : - ldq_u t0, 0(a0) # L : - lda t6, -1 # E : - - mskql t6, a0, t6 # E : - nop - nop - nop - - .align 4 -1: - subq a1, t4, a1 # E : sub dest misalignment from src addr - /* If source misalignment is larger than dest misalignment, we need - extra startup checks to avoid SEGV. */ - cmplt t4, t5, t12 # E : - extql t1, a1, t1 # U : shift src into place - lda t2, -1 # E : for creating masks later - - beq t12, $u_head # U : - mskqh t2, t5, t2 # U : begin src byte validity mask - cmpbge zero, t1, t8 # E : is there a zero? - nop - - extql t2, a1, t2 # U : - or t8, t10, t5 # E : test for end-of-count too - cmpbge zero, t2, t3 # E : - cmoveq a2, t5, t8 # E : Latency=2, extra map slot - - nop # E : goes with cmov - andnot t8, t3, t8 # E : - beq t8, $u_head # U : - nop - - /* At this point we've found a zero in the first partial word of - the source. We need to isolate the valid source data and mask - it into the original destination data. (Incidentally, we know - that we'll need at least one byte of that original dest word.) */ - - ldq_u t0, 0(a0) # L : - negq t8, t6 # E : build bitmask of bytes <= zero - mskqh t1, t4, t1 # U : - and t6, t8, t12 # E : - - subq t12, 1, t6 # E : - or t6, t12, t8 # E : - zapnot t2, t8, t2 # U : prepare source word; mirror changes - zapnot t1, t8, t1 # U : to source validity mask - - andnot t0, t2, t0 # E : zero place for source to reside - or t0, t1, t0 # E : and put it there - stq_u t0, 0(a0) # L : - nop - - .align 4 -$finish_up: - zapnot t0, t12, t4 # U : was last byte written null? - and t12, 0xf0, t3 # E : binary search for the address of the - cmovne t4, 1, t4 # E : Latency=2, extra map slot - nop # E : with cmovne - - and t12, 0xcc, t2 # E : last byte written - and t12, 0xaa, t1 # E : - cmovne t3, 4, t3 # E : Latency=2, extra map slot - nop # E : with cmovne - - bic a0, 7, t0 - cmovne t2, 2, t2 # E : Latency=2, extra map slot - nop # E : with cmovne - nop - - cmovne t1, 1, t1 # E : Latency=2, extra map slot - nop # E : with cmovne - addq t0, t3, t0 # E : - addq t1, t2, t1 # E : - - addq t0, t1, t0 # E : - addq t0, t4, t0 # add one if we filled the buffer - subq t0, v0, v0 # find string length - ret # L0 : - - .align 4 -$zerolength: - nop - nop - nop - clr v0 - -$exception: - nop - nop - nop - ret - - .end __strncpy_from_user diff --git a/arch/alpha/lib/ev6-stxncpy.S b/arch/alpha/lib/ev6-stxncpy.S index b581a7af245..1aa6e97e04b 100644 --- a/arch/alpha/lib/ev6-stxncpy.S +++ b/arch/alpha/lib/ev6-stxncpy.S @@ -362,10 +362,10 @@ $unaligned: extql t2, a1, t2 # U : cmpbge zero, t1, t8 # E : is there a zero? - andnot t2, t6, t12 # E : dest mask for a single word copy + andnot t2, t6, t2 # E : dest mask for a single word copy or t8, t10, t5 # E : test for end-of-count too - cmpbge zero, t12, t3 # E : + cmpbge zero, t2, t3 # E : cmoveq a2, t5, t8 # E : Latency=2, extra map slot nop # E : keep with cmoveq andnot t8, t3, t8 # E : (stall) @@ -379,13 +379,13 @@ $unaligned: negq t8, t6 # E : build bitmask of bytes <= zero mskqh t1, t4, t1 # U : - and t6, t8, t2 # E : - subq t2, 1, t6 # E : (stall) - or t6, t2, t8 # E : (stall) - zapnot t12, t8, t12 # U : prepare source word; mirror changes (stall) + and t6, t8, t12 # E : + subq t12, 1, t6 # E : (stall) + or t6, t12, t8 # E : (stall) + zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall) zapnot t1, t8, t1 # U : to source validity mask - andnot t0, t12, t0 # E : zero place for source to reside + andnot t0, t2, t0 # E : zero place for source to reside or t0, t1, t0 # E : and put it there (stall both t0, t1) stq_u t0, 0(a0) # L : (stall) diff --git a/arch/alpha/lib/ev67-strlen_user.S b/arch/alpha/lib/ev67-strlen_user.S deleted file mode 100644 index 57e0d77b81a..00000000000 --- a/arch/alpha/lib/ev67-strlen_user.S +++ /dev/null @@ -1,107 +0,0 @@ -/* - * arch/alpha/lib/ev67-strlen_user.S - * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com> - * - * Return the length of the string including the NULL terminator - * (strlen+1) or zero if an error occurred. - * - * In places where it is critical to limit the processing time, - * and the data is not trusted, strnlen_user() should be used. - * It will return a value greater than its second argument if - * that limit would be exceeded. This implementation is allowed - * to access memory beyond the limit, but will not cross a page - * boundary when doing so. - * - * Much of the information about 21264 scheduling/coding comes from: - * Compiler Writer's Guide for the Alpha 21264 - * abbreviated as 'CWG' in other comments here - * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html - * Scheduling notation: - * E - either cluster - * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 - * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 - * Try not to change the actual algorithm if possible for consistency. - */ - -#include <asm/regdef.h> - - -/* Allow an exception for an insn; exit if we get one. */ -#define EX(x,y...) \ - 99: x,##y; \ - .section __ex_table,"a"; \ - .long 99b - .; \ - lda v0, $exception-99b(zero); \ - .previous - - - .set noreorder - .set noat - .text - - .globl __strlen_user - .ent __strlen_user - .frame sp, 0, ra - - .align 4 -__strlen_user: - ldah a1, 32767(zero) # do not use plain strlen_user() for strings - # that might be almost 2 GB long; you should - # be using strnlen_user() instead - nop - nop - nop - - .globl __strnlen_user - - .align 4 -__strnlen_user: - .prologue 0 - EX( ldq_u t0, 0(a0) ) # L : load first quadword (a0 may be misaligned) - lda t1, -1(zero) # E : - - insqh t1, a0, t1 # U : - andnot a0, 7, v0 # E : - or t1, t0, t0 # E : - subq a0, 1, a0 # E : get our +1 for the return - - cmpbge zero, t0, t1 # E : t1 <- bitmask: bit i == 1 <==> i-th byte == 0 - subq a1, 7, t2 # E : - subq a0, v0, t0 # E : - bne t1, $found # U : - - addq t2, t0, t2 # E : - addq a1, 1, a1 # E : - nop # E : - nop # E : - - .align 4 -$loop: ble t2, $limit # U : - EX( ldq t0, 8(v0) ) # L : - nop # E : - nop # E : - - cmpbge zero, t0, t1 # E : - subq t2, 8, t2 # E : - addq v0, 8, v0 # E : addr += 8 - beq t1, $loop # U : - -$found: cttz t1, t2 # U0 : - addq v0, t2, v0 # E : - subq v0, a0, v0 # E : - ret # L0 : - -$exception: - nop - nop - nop - ret - - .align 4 # currently redundant -$limit: - nop - nop - subq a1, t2, v0 - ret - - .end __strlen_user diff --git a/arch/alpha/lib/ev67-strrchr.S b/arch/alpha/lib/ev67-strrchr.S index 3fd8bf414c7..dd0d8c6b9f5 100644 --- a/arch/alpha/lib/ev67-strrchr.S +++ b/arch/alpha/lib/ev67-strrchr.S @@ -82,7 +82,7 @@ $loop: $eos: negq t1, t4 # E : isolate first null byte match and t1, t4, t4 # E : - subq t4, 1, t5 # E : build a mask of the bytes upto... + subq t4, 1, t5 # E : build a mask of the bytes up to... or t4, t5, t4 # E : ... and including the null and t3, t4, t3 # E : mask out char matches after null diff --git a/arch/alpha/lib/fls.c b/arch/alpha/lib/fls.c new file mode 100644 index 00000000000..ddd048c0d82 --- /dev/null +++ b/arch/alpha/lib/fls.c @@ -0,0 +1,38 @@ +/* + * arch/alpha/lib/fls.c + */ + +#include <linux/module.h> +#include <linux/bitops.h> + +/* This is fls(x)-1, except zero is held to zero. This allows most + efficient input into extbl, plus it allows easy handling of fls(0)=0. */ + +const unsigned char __flsm1_tab[256] = +{ + 0, + 0, + 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +}; + +EXPORT_SYMBOL(__flsm1_tab); diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c index 97c4d9d7a4d..05017ba34c3 100644 --- a/arch/alpha/lib/fpreg.c +++ b/arch/alpha/lib/fpreg.c @@ -4,7 +4,7 @@ * (C) Copyright 1998 Linus Torvalds */ -#if defined(__alpha_cix__) || defined(__alpha_fix__) +#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) #define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val)); #else #define STT(reg,val) asm volatile ("stt $f"#reg",%0" : "=m"(val)); @@ -53,7 +53,7 @@ alpha_read_fp_reg (unsigned long reg) return val; } -#if defined(__alpha_cix__) || defined(__alpha_fix__) +#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) #define LDT(reg,val) asm volatile ("itoft %0,$f"#reg : : "r"(val)); #else #define LDT(reg,val) asm volatile ("ldt $f"#reg",%0" : : "m"(val)); @@ -98,7 +98,7 @@ alpha_write_fp_reg (unsigned long reg, unsigned long val) } } -#if defined(__alpha_cix__) || defined(__alpha_fix__) +#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) #define STS(reg,val) asm volatile ("ftois $f"#reg",%0" : "=r"(val)); #else #define STS(reg,val) asm volatile ("sts $f"#reg",%0" : "=m"(val)); @@ -147,7 +147,7 @@ alpha_read_fp_reg_s (unsigned long reg) return val; } -#if defined(__alpha_cix__) || defined(__alpha_fix__) +#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67) #define LDS(reg,val) asm volatile ("itofs %0,$f"#reg : : "r"(val)); #else #define LDS(reg,val) asm volatile ("lds $f"#reg",%0" : : "m"(val)); diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S index 8ff6e7e1773..76ccc6d1f36 100644 --- a/arch/alpha/lib/memset.S +++ b/arch/alpha/lib/memset.S @@ -1,5 +1,5 @@ /* - * linux/arch/alpha/memset.S + * linux/arch/alpha/lib/memset.S * * This is an efficient (and small) implementation of the C library "memset()" * function for the alpha. @@ -19,11 +19,13 @@ .text .globl memset .globl __memset + .globl ___memset .globl __memsetw .globl __constant_c_memset - .ent __memset + + .ent ___memset .align 5 -__memset: +___memset: .frame $30,0,$26,0 .prologue 0 @@ -103,7 +105,7 @@ within_one_quad: end: ret $31,($26),1 /* E1 */ - .end __memset + .end ___memset .align 5 .ent __memsetw @@ -121,4 +123,5 @@ __memsetw: .end __memsetw -memset = __memset +memset = ___memset +__memset = ___memset diff --git a/arch/alpha/lib/stacktrace.c b/arch/alpha/lib/stacktrace.c index 6d432e42aed..5e832161e6d 100644 --- a/arch/alpha/lib/stacktrace.c +++ b/arch/alpha/lib/stacktrace.c @@ -1,5 +1,4 @@ #include <linux/kernel.h> -#include <asm/system.h> typedef unsigned int instr; diff --git a/arch/alpha/lib/strcasecmp.c b/arch/alpha/lib/strcasecmp.c deleted file mode 100644 index 4e57a216fea..00000000000 --- a/arch/alpha/lib/strcasecmp.c +++ /dev/null @@ -1,26 +0,0 @@ -/* - * linux/arch/alpha/lib/strcasecmp.c - */ - -#include <linux/string.h> - - -/* We handle nothing here except the C locale. Since this is used in - only one place, on strings known to contain only 7 bit ASCII, this - is ok. */ - -int strcasecmp(const char *a, const char *b) -{ - int ca, cb; - - do { - ca = *a++ & 0xff; - cb = *b++ & 0xff; - if (ca >= 'A' && ca <= 'Z') - ca += 'a' - 'A'; - if (cb >= 'A' && cb <= 'Z') - cb += 'a' - 'A'; - } while (ca == cb && ca != '\0'); - - return ca - cb; -} diff --git a/arch/alpha/lib/strlen_user.S b/arch/alpha/lib/strlen_user.S deleted file mode 100644 index 508a18e9647..00000000000 --- a/arch/alpha/lib/strlen_user.S +++ /dev/null @@ -1,91 +0,0 @@ -/* - * arch/alpha/lib/strlen_user.S - * - * Return the length of the string including the NUL terminator - * (strlen+1) or zero if an error occurred. - * - * In places where it is critical to limit the processing time, - * and the data is not trusted, strnlen_user() should be used. - * It will return a value greater than its second argument if - * that limit would be exceeded. This implementation is allowed - * to access memory beyond the limit, but will not cross a page - * boundary when doing so. - */ - -#include <asm/regdef.h> - - -/* Allow an exception for an insn; exit if we get one. */ -#define EX(x,y...) \ - 99: x,##y; \ - .section __ex_table,"a"; \ - .long 99b - .; \ - lda v0, $exception-99b(zero); \ - .previous - - - .set noreorder - .set noat - .text - - .globl __strlen_user - .ent __strlen_user - .frame sp, 0, ra - - .align 3 -__strlen_user: - ldah a1, 32767(zero) # do not use plain strlen_user() for strings - # that might be almost 2 GB long; you should - # be using strnlen_user() instead - - .globl __strnlen_user - - .align 3 -__strnlen_user: - .prologue 0 - - EX( ldq_u t0, 0(a0) ) # load first quadword (a0 may be misaligned) - lda t1, -1(zero) - insqh t1, a0, t1 - andnot a0, 7, v0 - or t1, t0, t0 - subq a0, 1, a0 # get our +1 for the return - cmpbge zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0 - subq a1, 7, t2 - subq a0, v0, t0 - bne t1, $found - - addq t2, t0, t2 - addq a1, 1, a1 - - .align 3 -$loop: ble t2, $limit - EX( ldq t0, 8(v0) ) - subq t2, 8, t2 - addq v0, 8, v0 # addr += 8 - cmpbge zero, t0, t1 - beq t1, $loop - -$found: negq t1, t2 # clear all but least set bit - and t1, t2, t1 - - and t1, 0xf0, t2 # binary search for that set bit - and t1, 0xcc, t3 - and t1, 0xaa, t4 - cmovne t2, 4, t2 - cmovne t3, 2, t3 - cmovne t4, 1, t4 - addq t2, t3, t2 - addq v0, t4, v0 - addq v0, t2, v0 - nop # dual issue next two on ev4 and ev5 - subq v0, a0, v0 -$exception: - ret - - .align 3 # currently redundant -$limit: - subq a1, t2, v0 - ret - - .end __strlen_user diff --git a/arch/alpha/lib/strncpy.S b/arch/alpha/lib/strncpy.S index 338551c7113..a46f7f3ad8c 100644 --- a/arch/alpha/lib/strncpy.S +++ b/arch/alpha/lib/strncpy.S @@ -35,7 +35,7 @@ strncpy: or $3, $24, $3 # clear the bits between the last or $4, $27, $4 # written byte and the last byte in COUNT - andnot $4, $3, $4 + andnot $3, $4, $4 zap $1, $4, $1 stq_u $1, 0($16) @@ -43,8 +43,8 @@ strncpy: .align 4 $multiword: - subq $24, 1, $2 # clear the final bits in the prev word - or $2, $24, $2 + subq $27, 1, $2 # clear the final bits in the prev word + or $2, $27, $2 zapnot $1, $2, $1 subq $18, 1, $18 @@ -70,8 +70,8 @@ $multiword: bne $18, 0b 1: ldq_u $1, 0($16) # clear the leading bits in the final word - subq $27, 1, $2 - or $2, $27, $2 + subq $24, 1, $2 + or $2, $24, $2 zap $1, $2, $1 stq_u $1, 0($16) diff --git a/arch/alpha/lib/strncpy_from_user.S b/arch/alpha/lib/strncpy_from_user.S deleted file mode 100644 index 73ee21160ff..00000000000 --- a/arch/alpha/lib/strncpy_from_user.S +++ /dev/null @@ -1,339 +0,0 @@ -/* - * arch/alpha/lib/strncpy_from_user.S - * Contributed by Richard Henderson (rth@tamu.edu) - * - * Just like strncpy except in the return value: - * - * -EFAULT if an exception occurs before the terminator is copied. - * N if the buffer filled. - * - * Otherwise the length of the string is returned. - */ - - -#include <asm/errno.h> -#include <asm/regdef.h> - - -/* Allow an exception for an insn; exit if we get one. */ -#define EX(x,y...) \ - 99: x,##y; \ - .section __ex_table,"a"; \ - .long 99b - .; \ - lda $31, $exception-99b($0); \ - .previous - - - .set noat - .set noreorder - .text - - .globl __strncpy_from_user - .ent __strncpy_from_user - .frame $30, 0, $26 - .prologue 0 - - .align 3 -$aligned: - /* On entry to this basic block: - t0 == the first destination word for masking back in - t1 == the first source word. */ - - /* Create the 1st output word and detect 0's in the 1st input word. */ - lda t2, -1 # e1 : build a mask against false zero - mskqh t2, a1, t2 # e0 : detection in the src word - mskqh t1, a1, t3 # e0 : - ornot t1, t2, t2 # .. e1 : - mskql t0, a1, t0 # e0 : assemble the first output word - cmpbge zero, t2, t8 # .. e1 : bits set iff null found - or t0, t3, t0 # e0 : - beq a2, $a_eoc # .. e1 : - bne t8, $a_eos # .. e1 : - - /* On entry to this basic block: - t0 == a source word not containing a null. */ - -$a_loop: - stq_u t0, 0(a0) # e0 : - addq a0, 8, a0 # .. e1 : - EX( ldq_u t0, 0(a1) ) # e0 : - addq a1, 8, a1 # .. e1 : - subq a2, 1, a2 # e0 : - cmpbge zero, t0, t8 # .. e1 (stall) - beq a2, $a_eoc # e1 : - beq t8, $a_loop # e1 : - - /* Take care of the final (partial) word store. At this point - the end-of-count bit is set in t8 iff it applies. - - On entry to this basic block we have: - t0 == the source word containing the null - t8 == the cmpbge mask that found it. */ - -$a_eos: - negq t8, t12 # e0 : find low bit set - and t8, t12, t12 # e1 (stall) - - /* For the sake of the cache, don't read a destination word - if we're not going to need it. */ - and t12, 0x80, t6 # e0 : - bne t6, 1f # .. e1 (zdb) - - /* We're doing a partial word store and so need to combine - our source and original destination words. */ - ldq_u t1, 0(a0) # e0 : - subq t12, 1, t6 # .. e1 : - or t12, t6, t8 # e0 : - unop # - zapnot t0, t8, t0 # e0 : clear src bytes > null - zap t1, t8, t1 # .. e1 : clear dst bytes <= null - or t0, t1, t0 # e1 : - -1: stq_u t0, 0(a0) - br $finish_up - - /* Add the end-of-count bit to the eos detection bitmask. */ -$a_eoc: - or t10, t8, t8 - br $a_eos - - /*** The Function Entry Point ***/ - .align 3 -__strncpy_from_user: - mov a0, v0 # save the string start - beq a2, $zerolength - - /* Are source and destination co-aligned? */ - xor a0, a1, t1 # e0 : - and a0, 7, t0 # .. e1 : find dest misalignment - and t1, 7, t1 # e0 : - addq a2, t0, a2 # .. e1 : bias count by dest misalignment - subq a2, 1, a2 # e0 : - and a2, 7, t2 # e1 : - srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8 - addq zero, 1, t10 # .. e1 : - sll t10, t2, t10 # e0 : t10 = bitmask of last count byte - bne t1, $unaligned # .. e1 : - - /* We are co-aligned; take care of a partial first word. */ - - EX( ldq_u t1, 0(a1) ) # e0 : load first src word - addq a1, 8, a1 # .. e1 : - - beq t0, $aligned # avoid loading dest word if not needed - ldq_u t0, 0(a0) # e0 : - br $aligned # .. e1 : - - -/* The source and destination are not co-aligned. Align the destination - and cope. We have to be very careful about not reading too much and - causing a SEGV. */ - - .align 3 -$u_head: - /* We know just enough now to be able to assemble the first - full source word. We can still find a zero at the end of it - that prevents us from outputting the whole thing. - - On entry to this basic block: - t0 == the first dest word, unmasked - t1 == the shifted low bits of the first source word - t6 == bytemask that is -1 in dest word bytes */ - - EX( ldq_u t2, 8(a1) ) # e0 : load second src word - addq a1, 8, a1 # .. e1 : - mskql t0, a0, t0 # e0 : mask trailing garbage in dst - extqh t2, a1, t4 # e0 : - or t1, t4, t1 # e1 : first aligned src word complete - mskqh t1, a0, t1 # e0 : mask leading garbage in src - or t0, t1, t0 # e0 : first output word complete - or t0, t6, t6 # e1 : mask original data for zero test - cmpbge zero, t6, t8 # e0 : - beq a2, $u_eocfin # .. e1 : - bne t8, $u_final # e1 : - - lda t6, -1 # e1 : mask out the bits we have - mskql t6, a1, t6 # e0 : already seen - stq_u t0, 0(a0) # e0 : store first output word - or t6, t2, t2 # .. e1 : - cmpbge zero, t2, t8 # e0 : find nulls in second partial - addq a0, 8, a0 # .. e1 : - subq a2, 1, a2 # e0 : - bne t8, $u_late_head_exit # .. e1 : - - /* Finally, we've got all the stupid leading edge cases taken care - of and we can set up to enter the main loop. */ - - extql t2, a1, t1 # e0 : position hi-bits of lo word - EX( ldq_u t2, 8(a1) ) # .. e1 : read next high-order source word - addq a1, 8, a1 # e0 : - cmpbge zero, t2, t8 # e1 (stall) - beq a2, $u_eoc # e1 : - bne t8, $u_eos # e1 : - - /* Unaligned copy main loop. In order to avoid reading too much, - the loop is structured to detect zeros in aligned source words. - This has, unfortunately, effectively pulled half of a loop - iteration out into the head and half into the tail, but it does - prevent nastiness from accumulating in the very thing we want - to run as fast as possible. - - On entry to this basic block: - t1 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word - - We further know that t2 does not contain a null terminator. */ - - .align 3 -$u_loop: - extqh t2, a1, t0 # e0 : extract high bits for current word - addq a1, 8, a1 # .. e1 : - extql t2, a1, t3 # e0 : extract low bits for next time - addq a0, 8, a0 # .. e1 : - or t0, t1, t0 # e0 : current dst word now complete - EX( ldq_u t2, 0(a1) ) # .. e1 : load high word for next time - stq_u t0, -8(a0) # e0 : save the current word - mov t3, t1 # .. e1 : - subq a2, 1, a2 # e0 : - cmpbge zero, t2, t8 # .. e1 : test new word for eos - beq a2, $u_eoc # e1 : - beq t8, $u_loop # e1 : - - /* We've found a zero somewhere in the source word we just read. - If it resides in the lower half, we have one (probably partial) - word to write out, and if it resides in the upper half, we - have one full and one partial word left to write out. - - On entry to this basic block: - t1 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word. */ -$u_eos: - extqh t2, a1, t0 # e0 : - or t0, t1, t0 # e1 : first (partial) source word complete - - cmpbge zero, t0, t8 # e0 : is the null in this first bit? - bne t8, $u_final # .. e1 (zdb) - - stq_u t0, 0(a0) # e0 : the null was in the high-order bits - addq a0, 8, a0 # .. e1 : - subq a2, 1, a2 # e1 : - -$u_late_head_exit: - extql t2, a1, t0 # .. e0 : - cmpbge zero, t0, t8 # e0 : - or t8, t10, t6 # e1 : - cmoveq a2, t6, t8 # e0 : - nop # .. e1 : - - /* Take care of a final (probably partial) result word. - On entry to this basic block: - t0 == assembled source word - t8 == cmpbge mask that found the null. */ -$u_final: - negq t8, t6 # e0 : isolate low bit set - and t6, t8, t12 # e1 : - - and t12, 0x80, t6 # e0 : avoid dest word load if we can - bne t6, 1f # .. e1 (zdb) - - ldq_u t1, 0(a0) # e0 : - subq t12, 1, t6 # .. e1 : - or t6, t12, t8 # e0 : - zapnot t0, t8, t0 # .. e1 : kill source bytes > null - zap t1, t8, t1 # e0 : kill dest bytes <= null - or t0, t1, t0 # e1 : - -1: stq_u t0, 0(a0) # e0 : - br $finish_up - -$u_eoc: # end-of-count - extqh t2, a1, t0 - or t0, t1, t0 - cmpbge zero, t0, t8 - -$u_eocfin: # end-of-count, final word - or t10, t8, t8 - br $u_final - - /* Unaligned copy entry point. */ - .align 3 -$unaligned: - - EX( ldq_u t1, 0(a1) ) # e0 : load first source word - - and a0, 7, t4 # .. e1 : find dest misalignment - and a1, 7, t5 # e0 : find src misalignment - - /* Conditionally load the first destination word and a bytemask - with 0xff indicating that the destination byte is sacrosanct. */ - - mov zero, t0 # .. e1 : - mov zero, t6 # e0 : - beq t4, 1f # .. e1 : - ldq_u t0, 0(a0) # e0 : - lda t6, -1 # .. e1 : - mskql t6, a0, t6 # e0 : -1: - subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr - - /* If source misalignment is larger than dest misalignment, we need - extra startup checks to avoid SEGV. */ - - cmplt t4, t5, t12 # e1 : - extql t1, a1, t1 # .. e0 : shift src into place - lda t2, -1 # e0 : for creating masks later - beq t12, $u_head # e1 : - - mskqh t2, t5, t2 # e0 : begin src byte validity mask - cmpbge zero, t1, t8 # .. e1 : is there a zero? - extql t2, a1, t2 # e0 : - or t8, t10, t5 # .. e1 : test for end-of-count too - cmpbge zero, t2, t3 # e0 : - cmoveq a2, t5, t8 # .. e1 : - andnot t8, t3, t8 # e0 : - beq t8, $u_head # .. e1 (zdb) - - /* At this point we've found a zero in the first partial word of - the source. We need to isolate the valid source data and mask - it into the original destination data. (Incidentally, we know - that we'll need at least one byte of that original dest word.) */ - - ldq_u t0, 0(a0) # e0 : - negq t8, t6 # .. e1 : build bitmask of bytes <= zero - mskqh t1, t4, t1 # e0 : - and t6, t8, t12 # .. e1 : - subq t12, 1, t6 # e0 : - or t6, t12, t8 # e1 : - - zapnot t2, t8, t2 # e0 : prepare source word; mirror changes - zapnot t1, t8, t1 # .. e1 : to source validity mask - - andnot t0, t2, t0 # e0 : zero place for source to reside - or t0, t1, t0 # e1 : and put it there - stq_u t0, 0(a0) # e0 : - -$finish_up: - zapnot t0, t12, t4 # was last byte written null? - cmovne t4, 1, t4 - - and t12, 0xf0, t3 # binary search for the address of the - and t12, 0xcc, t2 # last byte written - and t12, 0xaa, t1 - bic a0, 7, t0 - cmovne t3, 4, t3 - cmovne t2, 2, t2 - cmovne t1, 1, t1 - addq t0, t3, t0 - addq t1, t2, t1 - addq t0, t1, t0 - addq t0, t4, t0 # add one if we filled the buffer - - subq t0, v0, v0 # find string length - ret - -$zerolength: - clr v0 -$exception: - ret - - .end __strncpy_from_user diff --git a/arch/alpha/lib/strrchr.S b/arch/alpha/lib/strrchr.S index 82cfd0ac907..1970dc07cfd 100644 --- a/arch/alpha/lib/strrchr.S +++ b/arch/alpha/lib/strrchr.S @@ -54,7 +54,7 @@ $loop: $eos: negq t1, t4 # e0 : isolate first null byte match and t1, t4, t4 # e1 : - subq t4, 1, t5 # e0 : build a mask of the bytes upto... + subq t4, 1, t5 # e0 : build a mask of the bytes up to... or t4, t5, t4 # e1 : ... and including the null and t3, t4, t3 # e0 : mask out char matches after null diff --git a/arch/alpha/lib/stxncpy.S b/arch/alpha/lib/stxncpy.S index da1a72740d2..3dece25283a 100644 --- a/arch/alpha/lib/stxncpy.S +++ b/arch/alpha/lib/stxncpy.S @@ -315,9 +315,9 @@ $unaligned: extql t2, a1, t2 # e0 : cmpbge zero, t1, t8 # .. e1 : is there a zero? - andnot t2, t6, t12 # e0 : dest mask for a single word copy + andnot t2, t6, t2 # e0 : dest mask for a single word copy or t8, t10, t5 # .. e1 : test for end-of-count too - cmpbge zero, t12, t3 # e0 : + cmpbge zero, t2, t3 # e0 : cmoveq a2, t5, t8 # .. e1 : andnot t8, t3, t8 # e0 : beq t8, $u_head # .. e1 (zdb) @@ -330,14 +330,14 @@ $unaligned: ldq_u t0, 0(a0) # e0 : negq t8, t6 # .. e1 : build bitmask of bytes <= zero mskqh t1, t4, t1 # e0 : - and t6, t8, t2 # .. e1 : - subq t2, 1, t6 # e0 : - or t6, t2, t8 # e1 : + and t6, t8, t12 # .. e1 : + subq t12, 1, t6 # e0 : + or t6, t12, t8 # e1 : - zapnot t12, t8, t12 # e0 : prepare source word; mirror changes + zapnot t2, t8, t2 # e0 : prepare source word; mirror changes zapnot t1, t8, t1 # .. e1 : to source validity mask - andnot t0, t12, t0 # e0 : zero place for source to reside + andnot t0, t2, t0 # e0 : zero place for source to reside or t0, t1, t0 # e1 : and put it there stq_u t0, 0(a0) # e0 : ret (t9) # .. e1 : diff --git a/arch/alpha/lib/udelay.c b/arch/alpha/lib/udelay.c index 1c879bbce41..69d52aa37ba 100644 --- a/arch/alpha/lib/udelay.c +++ b/arch/alpha/lib/udelay.c @@ -4,7 +4,6 @@ * Delay routines, using a pre-computed "loops_per_jiffy" value. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/sched.h> /* for udelay's use of smp_processor_id */ #include <asm/param.h> |
