diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2007-10-11 11:16:33 +0200 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2007-10-11 11:16:33 +0200 |
commit | 44f0257fc316ff4b33aa3438dd8d891b7d6d72b9 (patch) | |
tree | c1a9a571db37d631489f18e1dfe5554874b19027 /arch/x86/lib | |
parent | da957e111bb0c189a4a3bf8a00caaecb59ed94ca (diff) |
i386: move lib
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/Makefile | 5 | ||||
-rw-r--r-- | arch/x86/lib/Makefile_32 | 11 | ||||
-rw-r--r-- | arch/x86/lib/bitops_32.c | 70 | ||||
-rw-r--r-- | arch/x86/lib/checksum_32.S | 546 | ||||
-rw-r--r-- | arch/x86/lib/delay_32.c | 103 | ||||
-rw-r--r-- | arch/x86/lib/getuser_32.S | 78 | ||||
-rw-r--r-- | arch/x86/lib/memcpy_32.c | 43 | ||||
-rw-r--r-- | arch/x86/lib/mmx_32.c | 403 | ||||
-rw-r--r-- | arch/x86/lib/msr-on-cpu.c | 119 | ||||
-rw-r--r-- | arch/x86/lib/putuser_32.S | 98 | ||||
-rw-r--r-- | arch/x86/lib/semaphore_32.S | 219 | ||||
-rw-r--r-- | arch/x86/lib/string_32.c | 257 | ||||
-rw-r--r-- | arch/x86/lib/strstr_32.c | 31 | ||||
-rw-r--r-- | arch/x86/lib/usercopy_32.c | 882 |
14 files changed, 2865 insertions, 0 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile new file mode 100644 index 00000000000..2d7d724a2a6 --- /dev/null +++ b/arch/x86/lib/Makefile @@ -0,0 +1,5 @@ +ifeq ($(CONFIG_X86_32),y) +include ${srctree}/arch/x86/lib/Makefile_32 +else +include ${srctree}/arch/x86_64/lib/Makefile_64 +endif diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32 new file mode 100644 index 00000000000..98d1f1e2e2e --- /dev/null +++ b/arch/x86/lib/Makefile_32 @@ -0,0 +1,11 @@ +# +# Makefile for i386-specific library files.. +# + + +lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \ + bitops_32.o semaphore_32.o string_32.o + +lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o + +obj-$(CONFIG_SMP) += msr-on-cpu.o diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c new file mode 100644 index 00000000000..afd0045595d --- /dev/null +++ b/arch/x86/lib/bitops_32.c @@ -0,0 +1,70 @@ +#include <linux/bitops.h> +#include <linux/module.h> + +/** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +int find_next_bit(const unsigned long *addr, int size, int offset) +{ + const unsigned long *p = addr + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - addr)); + return (offset + set + res); +} +EXPORT_SYMBOL(find_next_bit); + +/** + * find_next_zero_bit - find the first zero bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +int find_next_zero_bit(const unsigned long *addr, int size, int offset) +{ + const unsigned long *p = addr + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for zero in the first 32 bits. + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (~(*p >> bit))); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No zero yet, search remaining full bytes for a zero + */ + res = find_first_zero_bit(p, size - 32 * (p - addr)); + return (offset + set + res); +} +EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S new file mode 100644 index 00000000000..adbccd0bbb7 --- /dev/null +++ b/arch/x86/lib/checksum_32.S @@ -0,0 +1,546 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/errno.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +ENTRY(csum_partial) + CFI_STARTPROC + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $3, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + testl $1, %esi # Check alignment. + jz 10f # Jump if alignment is boundary of 2bytes. + + # buf is odd + dec %ecx + jl 8f + movzbl (%esi), %ebx + adcl %ebx, %eax + roll $8, %eax + inc %esi + testl $2, %esi + jz 2f +10: + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + testl $1, 12(%esp) + jz 8f + roll $8, %eax +8: + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + ret + CFI_ENDPROC +ENDPROC(csum_partial) + +#else + +/* Version for PentiumII/PPro */ + +ENTRY(csum_partial) + CFI_STARTPROC + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: const unsigned char *buf + + testl $3, %esi + jnz 25f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b +25: + testl $1, %esi + jz 30f + # buf is odd + dec %ecx + jl 90f + movzbl (%esi), %ebx + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax + inc %esi + testl $2, %esi + jz 10b + +30: subl $2, %ecx + ja 20b + je 32f + addl $2, %ecx + jz 80f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + testl $1, 12(%esp) + jz 90f + roll $8, %eax +90: + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + ret + CFI_ENDPROC +ENDPROC(csum_partial) + +#endif + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + +#define ARGBASE 16 +#define FP 12 + +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC + subl $4,%esp + CFI_ADJUST_CFA_OFFSET 4 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi + popl %ecx # equivalent to addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum +# movl %ecx, %edx + movl %ecx, %ebx + movl %esi, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea -1(%esi),%edx + andl $-32,%edx + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx +1: addl $64,%esi + addl $64,%edi + SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + addl $64, %edx + dec %ecx + jge 1b +4: movl ARGBASE+12(%esp),%edx #len + andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE esi + popl %edi + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE edi + popl %ebx + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE ebx + ret + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) + +#undef ROUND +#undef ROUND1 + +#endif diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c new file mode 100644 index 00000000000..f6edb11364d --- /dev/null +++ b/arch/x86/lib/delay_32.c @@ -0,0 +1,103 @@ +/* + * Precise Delay Loops for i386 + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * The __delay function must _NOT_ be inlined as its execution time + * depends wildly on alignment on many x86 processors. The additional + * jump magic is needed to get the timing stable on all the CPU's + * we have to worry about. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/delay.h> + +#include <asm/processor.h> +#include <asm/delay.h> +#include <asm/timer.h> + +#ifdef CONFIG_SMP +# include <asm/smp.h> +#endif + +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + int d0; + + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscl(*timer_val); + return 0; + } + return -1; +} + +void __delay(unsigned long loops) +{ + delay_fn(loops); +} + +inline void __const_udelay(unsigned long xloops) +{ + int d0; + + xloops *= 4; + __asm__("mull %0" + :"=d" (xloops), "=&a" (d0) + :"1" (xloops), "0" + (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + + __delay(++xloops); +} + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} + +EXPORT_SYMBOL(__delay); +EXPORT_SYMBOL(__const_udelay); +EXPORT_SYMBOL(__udelay); +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S new file mode 100644 index 00000000000..6d84b53f12a --- /dev/null +++ b/arch/x86/lib/getuser_32.S @@ -0,0 +1,78 @@ +/* + * __get_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/thread_info.h> + + +/* + * __get_user_X + * + * Inputs: %eax contains the address + * + * Outputs: %eax is error code (0 or -EFAULT) + * %edx contains zero-extended value + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. + */ + +.text +ENTRY(__get_user_1) + CFI_STARTPROC + GET_THREAD_INFO(%edx) + cmpl TI_addr_limit(%edx),%eax + jae bad_get_user +1: movzbl (%eax),%edx + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__get_user_1) + +ENTRY(__get_user_2) + CFI_STARTPROC + addl $1,%eax + jc bad_get_user + GET_THREAD_INFO(%edx) + cmpl TI_addr_limit(%edx),%eax + jae bad_get_user +2: movzwl -1(%eax),%edx + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__get_user_2) + +ENTRY(__get_user_4) + CFI_STARTPROC + addl $3,%eax + jc bad_get_user + GET_THREAD_INFO(%edx) + cmpl TI_addr_limit(%edx),%eax + jae bad_get_user +3: movl -3(%eax),%edx + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__get_user_4) + +bad_get_user: + CFI_STARTPROC + xorl %edx,%edx + movl $-14,%eax + ret + CFI_ENDPROC +END(bad_get_user) + +.section __ex_table,"a" + .long 1b,bad_get_user + .long 2b,bad_get_user + .long 3b,bad_get_user +.previous diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c new file mode 100644 index 00000000000..8ac51b82a63 --- /dev/null +++ b/arch/x86/lib/memcpy_32.c @@ -0,0 +1,43 @@ +#include <linux/string.h> +#include <linux/module.h> + +#undef memcpy +#undef memset + +void *memcpy(void *to, const void *from, size_t n) +{ +#ifdef CONFIG_X86_USE_3DNOW + return __memcpy3d(to, from, n); +#else + return __memcpy(to, from, n); +#endif +} +EXPORT_SYMBOL(memcpy); + +void *memset(void *s, int c, size_t count) +{ + return __memset(s, c, count); +} +EXPORT_SYMBOL(memset); + +void *memmove(void *dest, const void *src, size_t n) +{ + int d0, d1, d2; + + if (dest < src) { + memcpy(dest,src,n); + } else { + __asm__ __volatile__( + "std\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (n-1+(const char *)src), + "2" (n-1+(char *)dest) + :"memory"); + } + return dest; +} +EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c new file mode 100644 index 00000000000..28084d2e8dd --- /dev/null +++ b/arch/x86/lib/mmx_32.c @@ -0,0 +1,403 @@ +#include <linux/types.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/hardirq.h> +#include <linux/module.h> + +#include <asm/i387.h> + + +/* + * MMX 3DNow! library helper functions + * + * To do: + * We can use MMX just for prefetch in IRQ's. This may be a win. + * (reported so on K6-III) + * We should use a better code neutral filler for the short jump + * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? + * We also want to clobber the filler register so we don't get any + * register forwarding stalls on the filler. + * + * Add *user handling. Checksums are not a win with MMX on any CPU + * tested so far for any MMX solution figured. + * + * 22/09/2000 - Arjan van de Ven + * Improved for non-egineering-sample Athlons + * + */ + +void *_mmx_memcpy(void *to, const void *from, size_t len) +{ + void *p; + int i; + + if (unlikely(in_interrupt())) + return __memcpy(to, from, len); + + p = to; + i = len >> 6; /* len/64 */ + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + "1: prefetch (%0)\n" /* This set is 28 bytes */ + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2: \n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) ); + + + for(; i>5; i--) + { + __asm__ __volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + + for(; i>0; i--) + { + __asm__ __volatile__ ( + " movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + /* + * Now do the tail of the block + */ + __memcpy(to, from, len&63); + kernel_fpu_end(); + return p; +} + +#ifdef CONFIG_MK7 + +/* + * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and + * other MMX using processors do not. + */ + +static void fast_clear_page(void *page) +{ + int i; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" : : + ); + + for(i=0;i<4096/64;i++) + { + __asm__ __volatile__ ( + " movntq %%mm0, (%0)\n" + " movntq %%mm0, 8(%0)\n" + " movntq %%mm0, 16(%0)\n" + " movntq %%mm0, 24(%0)\n" + " movntq %%mm0, 32(%0)\n" + " movntq %%mm0, 40(%0)\n" + " movntq %%mm0, 48(%0)\n" + " movntq %%mm0, 56(%0)\n" + : : "r" (page) : "memory"); + page+=64; + } + /* since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again. + */ + __asm__ __volatile__ ( + " sfence \n" : : + ); + kernel_fpu_end(); +} + +static void fast_copy_page(void *to, void *from) +{ + int i; + + kernel_fpu_begin(); + + /* maybe the prefetch stuff can go before the expensive fnsave... + * but that is for later. -AV + */ + __asm__ __volatile__ ( + "1: prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2: \n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) ); + + for(i=0; i<(4096-320)/64; i++) + { + __asm__ __volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movntq %%mm0, (%1)\n" + " movq 8(%0), %%mm1\n" + " movntq %%mm1, 8(%1)\n" + " movq 16(%0), %%mm2\n" + " movntq %%mm2, 16(%1)\n" + " movq 24(%0), %%mm3\n" + " movntq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm4\n" + " movntq %%mm4, 32(%1)\n" + " movq 40(%0), %%mm5\n" + " movntq %%mm5, 40(%1)\n" + " movq 48(%0), %%mm6\n" + " movntq %%mm6, 48(%1)\n" + " movq 56(%0), %%mm7\n" + " movntq %%mm7, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + for(i=(4096-320)/64; i<4096/64; i++) + { + __asm__ __volatile__ ( + "2: movq (%0), %%mm0\n" + " movntq %%mm0, (%1)\n" + " movq 8(%0), %%mm1\n" + " movntq %%mm1, 8(%1)\n" + " movq 16(%0), %%mm2\n" + " movntq %%mm2, 16(%1)\n" + " movq 24(%0), %%mm3\n" + " movntq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm4\n" + " movntq %%mm4, 32(%1)\n" + " movq 40(%0), %%mm5\n" + " movntq %%mm5, 40(%1)\n" + " movq 48(%0), %%mm6\n" + " movntq %%mm6, 48(%1)\n" + " movq 56(%0), %%mm7\n" + " movntq %%mm7, 56(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + /* since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again. + */ + __asm__ __volatile__ ( + " sfence \n" : : + ); + kernel_fpu_end(); +} + +#else + +/* + * Generic MMX implementation without K7 specific streaming + */ + +static void fast_clear_page(void *page) +{ + int i; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" : : + ); + + for(i=0;i<4096/128;i++) + { + __asm__ __volatile__ ( + " movq %%mm0, (%0)\n" + " movq %%mm0, 8(%0)\n" + " movq %%mm0, 16(%0)\n" + " movq %%mm0, 24(%0)\n" + " movq %%mm0, 32(%0)\n" + " movq %%mm0, 40(%0)\n" + " movq %%mm0, 48(%0)\n" + " movq %%mm0, 56(%0)\n" + " movq %%mm0, 64(%0)\n" + " movq %%mm0, 72(%0)\n" + " movq %%mm0, 80(%0)\n" + " movq %%mm0, 88(%0)\n" + " movq %%mm0, 96(%0)\n" + " movq %%mm0, 104(%0)\n" + " movq %%mm0, 112(%0)\n" + " movq %%mm0, 120(%0)\n" + : : "r" (page) : "memory"); + page+=128; + } + + kernel_fpu_end(); +} + +static void fast_copy_page(void *to, void *from) |