diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/lib |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r-- | arch/x86_64/lib/Makefile | 14 | ||||
-rw-r--r-- | arch/x86_64/lib/bitops.c | 141 | ||||
-rw-r--r-- | arch/x86_64/lib/bitstr.c | 28 | ||||
-rw-r--r-- | arch/x86_64/lib/clear_page.S | 50 | ||||
-rw-r--r-- | arch/x86_64/lib/copy_page.S | 101 | ||||
-rw-r--r-- | arch/x86_64/lib/copy_user.S | 294 | ||||
-rw-r--r-- | arch/x86_64/lib/csum-copy.S | 233 | ||||
-rw-r--r-- | arch/x86_64/lib/csum-partial.c | 150 | ||||
-rw-r--r-- | arch/x86_64/lib/csum-wrappers.c | 129 | ||||
-rw-r--r-- | arch/x86_64/lib/dec_and_lock.c | 40 | ||||
-rw-r--r-- | arch/x86_64/lib/delay.c | 48 | ||||
-rw-r--r-- | arch/x86_64/lib/getuser.S | 101 | ||||
-rw-r--r-- | arch/x86_64/lib/io.c | 23 | ||||
-rw-r--r-- | arch/x86_64/lib/memcpy.S | 121 | ||||
-rw-r--r-- | arch/x86_64/lib/memmove.c | 19 | ||||
-rw-r--r-- | arch/x86_64/lib/memset.S | 125 | ||||
-rw-r--r-- | arch/x86_64/lib/putuser.S | 89 | ||||
-rw-r--r-- | arch/x86_64/lib/thunk.S | 95 | ||||
-rw-r--r-- | arch/x86_64/lib/usercopy.c | 153 |
19 files changed, 1954 insertions, 0 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile new file mode 100644 index 00000000000..6b26a1c1e9f --- /dev/null +++ b/arch/x86_64/lib/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for x86_64-specific library files. +# + +CFLAGS_csum-partial.o := -funroll-loops + +obj-y := io.o + +lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ + usercopy.o getuser.o putuser.o \ + thunk.o clear_page.o copy_page.o bitstr.o bitops.o +lib-y += memcpy.o memmove.o memset.o copy_user.o + +lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c new file mode 100644 index 00000000000..a29fb75b33a --- /dev/null +++ b/arch/x86_64/lib/bitops.c @@ -0,0 +1,141 @@ +#include <linux/bitops.h> + +#undef find_first_zero_bit +#undef find_next_zero_bit +#undef find_first_bit +#undef find_next_bit + +/** + * find_first_zero_bit - find the first zero bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first zero bit, not the number of the byte + * containing a bit. + */ +inline long find_first_zero_bit(const unsigned long * addr, unsigned long size) +{ + long d0, d1, d2; + long res; + + if (!size) + return 0; + asm volatile( + " repe; scasq\n" + " je 1f\n" + " xorq -8(%%rdi),%%rax\n" + " subq $8,%%rdi\n" + " bsfq %%rax,%%rdx\n" + "1: subq %[addr],%%rdi\n" + " shlq $3,%%rdi\n" + " addq %%rdi,%%rdx" + :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) + :"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL), + [addr] "r" (addr) : "memory"); + return res; +} + +/** + * find_next_zero_bit - find the first zero bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +long find_next_zero_bit (const unsigned long * addr, long size, long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long set = 0; + unsigned long res, bit = offset&63; + + if (bit) { + /* + * Look for zero in first word + */ + asm("bsfq %1,%0\n\t" + "cmoveq %2,%0" + : "=r" (set) + : "r" (~(*p >> bit)), "r"(64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No zero yet, search remaining full words for a zero + */ + res = find_first_zero_bit ((const unsigned long *)p, + size - 64 * (p - (unsigned long *) addr)); + return (offset + set + res); +} + +static inline long +__find_first_bit(const unsigned long * addr, unsigned long size) +{ + long d0, d1; + long res; + + asm volatile( + " repe; scasq\n" + " jz 1f\n" + " subq $8,%%rdi\n" + " bsfq (%%rdi),%%rax\n" + "1: subq %[addr],%%rdi\n" + " shlq $3,%%rdi\n" + " addq %%rdi,%%rax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"0" (0ULL), + "1" ((size + 63) >> 6), "2" (addr), + [addr] "r" (addr) : "memory"); + return res; +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +long find_first_bit(const unsigned long * addr, unsigned long size) +{ + return __find_first_bit(addr,size); +} + +/** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +long find_next_bit(const unsigned long * addr, long size, long offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0, bit = offset & 63, res; + + if (bit) { + /* + * Look for nonzero in the first 64 bits: + */ + asm("bsfq %1,%0\n\t" + "cmoveq %2,%0\n\t" + : "=r" (set) + : "r" (*p >> bit), "r" (64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = __find_first_bit (p, size - 64 * (p - addr)); + return (offset + set + res); +} + +#include <linux/module.h> + +EXPORT_SYMBOL(find_next_bit); +EXPORT_SYMBOL(find_first_bit); +EXPORT_SYMBOL(find_first_zero_bit); +EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c new file mode 100644 index 00000000000..24676609a6a --- /dev/null +++ b/arch/x86_64/lib/bitstr.c @@ -0,0 +1,28 @@ +#include <linux/module.h> +#include <linux/bitops.h> + +/* Find string of zero bits in a bitmap */ +unsigned long +find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) +{ + unsigned long n, end, i; + + again: + n = find_next_zero_bit(bitmap, nbits, start); + if (n == -1) + return -1; + + /* could test bitsliced, but it's hardly worth it */ + end = n+len; + if (end >= nbits) + return -1; + for (i = n+1; i < end; i++) { + if (test_bit(i, bitmap)) { + start = i+1; + goto again; + } + } + return n; +} + +EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S new file mode 100644 index 00000000000..30a9da458c1 --- /dev/null +++ b/arch/x86_64/lib/clear_page.S @@ -0,0 +1,50 @@ +/* + * Zero a page. + * rdi page + */ + .globl clear_page + .p2align 4 +clear_page: + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret +clear_page_end: + + /* C stepping K8 run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad clear_page_c + .byte X86_FEATURE_K8_C + .byte clear_page_end-clear_page + .byte clear_page_c_end-clear_page_c + .previous + + .section .altinstr_replacement,"ax" +clear_page_c: + movl $4096/8,%ecx + xorl %eax,%eax + rep + stosq + ret +clear_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S new file mode 100644 index 00000000000..dd3aa47b6bf --- /dev/null +++ b/arch/x86_64/lib/copy_page.S @@ -0,0 +1,101 @@ +/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ + +/* Don't use streaming store because it's better when the target + ends up in cache. */ + +/* Could vary the prefetch distance based on SMP/UP */ + + .globl copy_page + .p2align 4 +copy_page: + subq $3*8,%rsp + movq %rbx,(%rsp) + movq %r12,1*8(%rsp) + movq %r13,2*8(%rsp) + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + movq 1*8(%rsp),%r12 + movq 2*8(%rsp),%r13 + addq $3*8,%rsp + ret + + /* C stepping K8 run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad copy_page_c + .byte X86_FEATURE_K8_C + .byte copy_page_c_end-copy_page_c + .byte copy_page_c_end-copy_page_c + .previous + + .section .altinstr_replacement,"ax" +copy_page_c: + movl $4096/8,%ecx + rep + movsq + ret +copy_page_c_end: + .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S new file mode 100644 index 00000000000..bd556c80424 --- /dev/null +++ b/arch/x86_64/lib/copy_user.S @@ -0,0 +1,294 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#define FIX_ALIGNMENT 1 + + #include <asm/current.h> + #include <asm/offset.h> + #include <asm/thread_info.h> + #include <asm/cpufeature.h> + +/* Standard copy_to_user with segment limit checking */ + .globl copy_to_user + .p2align 4 +copy_to_user: + GET_THREAD_INFO(%rax) + movq %rdi,%rcx + addq %rdx,%rcx + jc bad_to_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_to_user +2: + .byte 0xe9 /* 32bit jump */ + .long .Lcug-1f +1: + + .section .altinstr_replacement,"ax" +3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ + .long copy_user_generic_c-1b /* offset */ + .previous + .section .altinstructions,"a" + .align 8 + .quad 2b + .quad 3b + .byte X86_FEATURE_K8_C + .byte 5 + .byte 5 + .previous + +/* Standard copy_from_user with segment limit checking */ + .globl copy_from_user + .p2align 4 +copy_from_user: + GET_THREAD_INFO(%rax) + movq %rsi,%rcx + addq %rdx,%rcx + jc bad_from_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_from_user + /* FALL THROUGH to copy_user_generic */ + + .section .fixup,"ax" + /* must zero dest */ +bad_from_user: + movl %edx,%ecx + xorl %eax,%eax + rep + stosb +bad_to_user: + movl %edx,%eax + ret + .previous + + +/* + * copy_user_generic - memory copy with exception handling. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ + .globl copy_user_generic + .p2align 4 +copy_user_generic: + .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ + .byte 0x66,0x90 +1: + .section .altinstr_replacement,"ax" +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long copy_user_generic_c-1b /* offset */ + .previous + .section .altinstructions,"a" + .align 8 + .quad copy_user_generic + .quad 2b + .byte X86_FEATURE_K8_C + .byte 5 + .byte 5 + .previous +.Lcug: + pushq %rbx + xorl %eax,%eax /*zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movq %r11,(%rdi) +.Ld2: movq %r8,1*8(%rdi) +.Ld3: movq %r9,2*8(%rdi) +.Ld4: movq %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movq %r11,4*8(%rdi) +.Ld6: movq %r8,5*8(%rdi) +.Ld7: movq %r9,6*8(%rdi) +.Ld8: movq %r10,7*8(%rdi) + + decq %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movq %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + ret + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + + /* C stepping K8 run faster using the string copy instructions. + This is also a lot simpler. Use them when possible. + Patch in jmps to this code instead of copying it fully + to avoid unwanted aliasing in the exception tables. */ + + /* rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successfull. + */ +copy_user_generic_c: + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx +1: rep + movsq + movl %edx,%ecx +2: rep + movsb +4: movl %ecx,%eax + ret +3: lea (%rdx,%rcx,8),%rax + ret + + .section __ex_table,"a" + .quad 1b,3b + .quad 2b,4b + .previous diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S new file mode 100644 index 00000000000..01808ec3783 --- /dev/null +++ b/arch/x86_64/lib/csum-copy.S @@ -0,0 +1,233 @@ +/* + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. No warranty for anything given at all. + */ + #include <linux/linkage.h> + #include <asm/errno.h> + +/* + * Checksum copy with exception handling. + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * destination is zeroed. + * + * Input + * rdi source + * rsi destination + * edx len (32bit) + * ecx sum (32bit) + * r8 src_err_ptr (int) + * r9 dst_err_ptr (int) + * + * Output + * eax 64bit sum. undefined in case of exception. + * + * Wrappers need to take care of valid exception sum and zeroing. + * They also should align source or destination to 8 bytes. + */ + + .macro source +10: + .section __ex_table,"a" + .align 8 + .quad 10b,.Lbad_source + .previous + .endm + + .macro dest +20: + .section __ex_table,"a" + .align 8 + .quad 20b,.Lbad_dest + .previous + .endm + + .macro ignore L=.Lignore +30: + .section __ex_table,"a" + .align 8 + .quad 30b,\L + .previous + .endm + + + .globl csum_partial_copy_generic + .p2align 4 +csum_partial_copy_generic: + cmpl $3*64,%edx + jle .Lignore + +.Lignore: + subq $7*8,%rsp + movq %rbx,2*8(%rsp) + movq %r12,3*8(%rsp) + movq %r14,4*8(%rsp) + movq %r13,5*8(%rsp) + movq %rbp,6*8(%rsp) + + movq %r8,(%rsp) + movq %r9,1*8(%rsp) + + movl %ecx,%eax + movl %edx,%ecx + + xorl %r9d,%r9d + movq %rcx,%r12 + + shrq $6,%r12 + jz .Lhandle_tail /* < 64 */ + + clc + + /* main loop. clear in 64 byte blocks */ + /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ + /* r11: temp3, rdx: temp4, r12 loopcnt */ + /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ + .p2align 4 +.Lloop: + source + movq (%rdi),%rbx + source + movq 8(%rdi),%r8 + source + movq 16(%rdi),%r11 + source + movq 24(%rdi),%rdx + + source + movq 32(%rdi),%r10 + source + movq 40(%rdi),%rbp + source + movq 48(%rdi),%r14 + source + movq 56(%rdi),%r13 + + ignore 2f + prefetcht0 5*64(%rdi) +2: + adcq %rbx,%rax + adcq %r8,%rax + adcq %r11,%rax + adcq %rdx,%rax + adcq %r10,%rax + adcq %rbp,%rax + adcq %r14,%rax + adcq %r13,%rax + + decl %r12d + + dest + movq %rbx,(%rsi) + dest + movq %r8,8(%rsi) + dest + movq %r11,16(%rsi) + dest + movq %rdx,24(%rsi) + + dest + movq %r10,32(%rsi) + dest + movq %rbp,40(%rsi) + dest + movq %r14,48(%rsi) + dest + movq %r13,56(%rsi) + +3: + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Lloop + + adcq %r9,%rax + + /* do last upto 56 bytes */ +.Lhandle_tail: + /* ecx: count */ + movl %ecx,%r10d + andl $63,%ecx + shrl $3,%ecx + jz .Lfold + clc + .p2align 4 +.Lloop_8: + source + movq (%rdi),%rbx + adcq %rbx,%rax + decl %ecx + dest + movq %rbx,(%rsi) + leaq 8(%rsi),%rsi /* preserve carry */ + leaq 8(%rdi),%rdi + jnz .Lloop_8 + adcq %r9,%rax /* add in carry */ + +.Lfold: + /* reduce checksum to 32bits */ + movl %eax,%ebx + shrq $32,%rax + addl %ebx,%eax + adcl %r9d,%eax + + /* do last upto 6 bytes */ +.Lhandle_7: + movl %r10d,%ecx + andl $7,%ecx + shrl $1,%ecx + jz .Lhandle_1 + movl $2,%edx + xorl %ebx,%ebx + clc + .p2align 4 +.Lloop_1: + source + movw (%rdi),%bx + adcl %ebx,%eax + dest + decl %ecx + movw %bx,(%rsi) + leaq 2(%rdi),%rdi + leaq 2(%rsi),%rsi + jnz .Lloop_1 + adcl %r9d,%eax /* add in carry */ + + /* handle last odd byte */ +.Lhandle_1: + testl $1,%r10d + jz .Lende + xorl %ebx,%ebx + source + movb (%rdi),%bl + dest + movb %bl,(%rsi) + addl %ebx,%eax + adcl %r9d,%eax /* carry */ + +.Lende: + movq 2*8(%rsp),%rbx + movq 3*8(%rsp),%r12 + movq 4*8(%rsp),%r14 + movq 5*8(%rsp),%r13 + movq 6*8(%rsp),%rbp + addq $7*8,%rsp + ret + + /* Exception handlers. Very simple, zeroing is done in the wrappers */ +.Lbad_source: + movq (%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + +.Lbad_dest: + movq 8(%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c new file mode 100644 index 00000000000..5384e227cdf --- /dev/null +++ b/arch/x86_64/lib/csum-partial.c @@ -0,0 +1,150 @@ +/* + * arch/x86_64/lib/csum-partial.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed. + */ + +#include <linux/compiler.h> +#include <linux/module.h> +#include <asm/checksum.h> + +#define __force_inline inline __attribute__((always_inline)) + +static inline unsigned short from32to16(unsigned a) +{ + unsigned short b = a >> 16; + asm("addw %w2,%w0\n\t" + "adcw $0,%w0\n" + : "=r" (b) + : "0" (b), "r" (a)); + return b; +} + +/* + * Do a 64-bit checksum on an arbitrary memory area. + * Returns a 32bit checksum. + * + * This isn't as time critical as it used to be because many NICs + * do hardware checksumming these days. + * + * Things tried and found to not make it faster: + * Manual Prefetching + * Unrolling to an 128 bytes inner loop. + * Using interleaving with more registers to break the carry chains. + */ +static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len) +{ + unsigned odd, count; + unsigned long result = 0; + + if (unlikely(len == 0)) + return result; + odd = 1 & (unsigned long) buff; + if (unlikely(odd)) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *)buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + unsigned long zero; + unsigned count64; + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + + /* main loop using 64byte blocks */ + zero = 0; + count64 = count >> 3; + while (count64) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq 4*8(%[src]),%[res]\n\t" + "adcq 5*8(%[src]),%[res]\n\t" + "adcq 6*8(%[src]),%[res]\n\t" + "adcq 7*8(%[src]),%[res]\n\t" + "adcq %[zero],%[res]" + : [res] "=r" (result) + : [src] "r" (buff), [zero] "r" (zero), + "[res]" (result)); + buff += 64; + count64--; + } + + /* last upto 7 8byte blocks */ + count %= 8; + while (count) { + asm("addq %1,%0\n\t" + "adcq %2,%0\n" + : "=r" (result) + : "m" (*(unsigned long *)buff), + "r" (zero), "0" (result)); + --count; + buff += 8; + } + result = add32_with_carry(result>>32, + result&0xffffffff); + + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + result = add32_with_carry(result>>32, result & 0xffffffff); + if (unlikely(odd)) { + result = from32to16(result); + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + } + return result; +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 64-bit boundary + */ +unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum) +{ + return add32_with_carry(do_csum(buff, len), sum); +} + +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +unsigned short ip_compute_csum(unsigned char * buff, int len) +{ + return csum_fold(csum_partial(buff,len,0)); +} + diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c new file mode 100644 index 00000000000..94323f20816 --- /dev/null +++ b/arch/x86_64/lib/csum-wrappers.c @@ -0,0 +1,129 @@ +/* Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v.2 + * + * Wrappers of assembly checksum functions for x86-64. + */ + +#include <asm/checksum.h> +#include <linux/module.h> + +/** + * csum_partial_copy_from_user - Copy and checksum from user space. + * @src: source address (user space) + * @dst: destination address + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * @errp: set to -EFAULT for an bad source address. + * + * Returns an 32bit unfolded checksum of the buffer. + * src and dst are best aligned to 64bits. + */ +unsigned int +csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, + int len, unsigned int isum, int *errp) +{ + might_sleep(); + *errp = 0; + if (likely(access_ok(VERIFY_READ,src, len))) { + /* Why 6, not 7? To handle odd addresses aligned we + would need to do considerable complications to fix the + checksum which is defined as an 16bit accumulator. The + fix alignment code is primarily for performance + compatibility with 32bit and that will handle odd + addresses slowly too. */ + if (unlikely((unsigned long)src & 6)) { + while (((unsigned long)src & 6) && len >= 2) { + __u16 val16; + *errp = __get_user(val16, (__u16 __user *)src); + if (*errp) + return isum; + *(__u16 *)dst = val16; + isum = add32_with_carry(isum, val16); + src += 2; + dst += 2; + len -= 2; + } + } + isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL); + if (likely(*errp == 0)) + return isum; + } + *errp = -EFAULT; + memset(dst,0,len); + return isum; +} + +EXPORT_SYMBOL(csum_partial_copy_from_user); + +/** + * csum_partial_copy_to_user - Copy and checksum to user space. + * @src: source address + * @dst: destination address (user space) + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * @errp: set to -EFAULT for an bad destination address. + * + * Returns an 32bit unfolded checksum of the buffer. + * src and dst are best aligned to 64bits. + */ +unsigned int +csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst, + int len, unsigned int isum, int *errp) +{ + might_sleep(); + if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { + *errp = -EFAULT; + return 0; + } + + if (unlikely((unsigned long)dst & 6)) { + while (((unsigned long)dst &am |