Linux-2.6.12-rc2v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/lib
19 files changed, 1954 insertions, 0 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
new file mode 100644
index 00000000000..6b26a1c1e9f
--- /dev/null
+++ b/arch/x86_64/lib/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial.o := -funroll-loops
+
+obj-y := io.o
+
+lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
+	usercopy.o getuser.o putuser.o  \
+	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o
+
+lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
new file mode 100644
index 00000000000..a29fb75b33a
--- /dev/null
+++ b/arch/x86_64/lib/bitops.c
@@ -0,0 +1,141 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ *
+ * The region is scanned in whole 64-bit words, so @size is rounded up to a
+ * multiple of 64. When every scanned word is all ones, the rounded-up bit
+ * count is returned. When @size rounds to zero words, 0 is returned without
+ * touching memory.
+ */
+inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+	long d0, d1, d2;
+	long res;
+
+	/*
+	 * Test the size in words, not in bits. A size in the range -63..-1
+	 * (which find_next_zero_bit() can pass when offset lies in the last
+	 * word) is non-zero in bits but zero in words; repe scasq would then
+	 * execute no iteration at all and the following je would act on
+	 * whatever flags happened to be set before the asm.
+	 */
+	size = (size + 63) >> 6;
+	if (!size)
+		return 0;
+	asm volatile(
+		"  repe; scasq\n"
+		"  je 1f\n"
+		"  xorq -8(%%rdi),%%rax\n"
+		"  subq $8,%%rdi\n"
+		"  bsfq %%rax,%%rdx\n"
+		"1:  subq %[addr],%%rdi\n"
+		"  shlq $3,%%rdi\n"
+		"  addq %%rdi,%%rdx"
+		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+		:"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+		 [addr] "r" (addr) : "memory");
+	return res;
+}
+
+/**
+ * find_next_zero_bit - find the next zero bit in a memory region
+ * @addr: The address to base the search on
+ * @size: The maximum size to search
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit-number of the first zero bit at or after @offset. When
+ * @offset is not a multiple of 64, the rest of the word containing it is
+ * examined first; the words after it are handed to find_first_zero_bit()
+ * with the remaining size. The result is not clamped to @size here, so it
+ * can be larger than @size (presumably callers compare against @size -
+ * confirm).
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+	unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
+	unsigned long set = 0;
+	unsigned long res, bit = offset&63;
+
+	if (bit) {
+		/*
+		 * Look for zero in first word. bsfq leaves ZF set when its
+		 * source is zero, in which case cmoveq substitutes 64.
+		 */
+		asm("bsfq %1,%0\n\t"
+		    "cmoveq %2,%0"
+		    : "=r" (set)
+		    : "r" (~(*p >> bit)), "r"(64L));
+		if (set < (64 - bit))
+			return set + offset;
+		set = 64 - bit;
+		p++;
+	}
+	/*
+	 * No zero yet, search remaining full words for a zero
+	 */
+	res = find_first_zero_bit ((const unsigned long *)p,
+				   size - 64 * (p - (unsigned long *) addr));
+	return (offset + set + res);
+}
+
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+	long d0, d1;
+	long res;
+
+	/*
+	 * Scan (size + 63) / 64 whole words for the first set bit and return
+	 * its bit number; if every scanned word is zero the result is the
+	 * number of bits scanned.
+	 *
+	 * The size must be tested in words, not in bits: for a size of 0 or
+	 * in the range -63..-1 (find_next_bit() passes such values when
+	 * offset lies in the last word and no bit is found there) repe scasq
+	 * executes no iteration, leaves the flags untouched, and the jz below
+	 * would branch on stale flags - possibly reading the word in front of
+	 * addr. Return 0 for an empty scan instead.
+	 */
+	size = (size + 63) >> 6;
+	if (!size)
+		return 0;
+	asm volatile(
+		"   repe; scasq\n"
+		"   jz 1f\n"
+		"   subq $8,%%rdi\n"
+		"   bsfq (%%rdi),%%rax\n"
+		"1: subq %[addr],%%rdi\n"
+		"   shlq $3,%%rdi\n"
+		"   addq %%rdi,%%rax"
+		:"=a" (res), "=&c" (d0), "=&D" (d1)
+		:"0" (0ULL),
+		 "1" (size), "2" (addr),
+		 [addr] "r" (addr) : "memory");
+	return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ *
+ * This is the out-of-line entry point for __find_first_bit(), which scans
+ * whole 64-bit words: @size is rounded up to a multiple of 64 and that
+ * rounded-up bit count is the result when no set bit is found.
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+	return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @size: The maximum size to search
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit-number of the first set bit at or after @offset. When
+ * @offset is not a multiple of 64, the rest of the word containing it is
+ * examined first; the words after it are handed to __find_first_bit() with
+ * the remaining size. The result is not clamped to @size here, so it can be
+ * larger than @size (presumably callers compare against @size - confirm).
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+	const unsigned long * p = addr + (offset >> 6);
+	unsigned long set = 0, bit = offset & 63, res;
+
+	if (bit) {
+		/*
+		 * Look for nonzero in the first 64 bits. bsfq leaves ZF set
+		 * when its source is zero, in which case cmoveq substitutes 64.
+		 */
+		asm("bsfq %1,%0\n\t"
+		    "cmoveq %2,%0\n\t"
+		    : "=r" (set)
+		    : "r" (*p >> bit), "r" (64L));
+		if (set < (64 - bit))
+			return set + offset;
+		set = 64 - bit;
+		p++;
+	}
+	/*
+	 * No set bit yet, search remaining full words for a bit
+	 */
+	res = __find_first_bit (p, size - 64 * (p - addr));
+	return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c
new file mode 100644
index 00000000000..24676609a6a
--- /dev/null
+++ b/arch/x86_64/lib/bitstr.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/*
+ * Find string of zero bits in a bitmap.
+ *
+ * Starting at bit @start, looks in @bitmap (@nbits bits long) for @len
+ * consecutive zero bits. Returns the index of the first bit of the string,
+ * or -1 (converted to unsigned long) when none is found.
+ *
+ * NOTE(review): the "end >= nbits" test also rejects a string whose last bit
+ * is bit nbits - 1; confirm whether excluding that position is intended.
+ * NOTE(review): find_next_zero_bit() as defined in bitops.c does not appear
+ * to return -1, so the "n == -1" test looks like dead code and the not-found
+ * case is presumably caught by the "end >= nbits" test - confirm.
+ */ 
+unsigned long 
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{ 
+	unsigned long n, end, i; 	
+
+ again:
+	n = find_next_zero_bit(bitmap, nbits, start);
+	if (n == -1) 
+		return -1;
+	
+	/* could test bitsliced, but it's hardly worth it */
+	end = n+len;
+	if (end >= nbits) 
+		return -1; 
+	for (i = n+1; i < end; i++) { 
+		if (test_bit(i, bitmap)) {  
+			start = i+1; 
+			goto again; 
+		} 
+	}
+	return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
new file mode 100644
index 00000000000..30a9da458c1
--- /dev/null
+++ b/arch/x86_64/lib/clear_page.S
@@ -0,0 +1,50 @@
+/*
+ * Zero a page. 	
+ * rdi	page
+ */			
+	.globl clear_page
+	.p2align 4
+clear_page:
+	xorl   %eax,%eax
+	movl   $4096/64,%ecx
+	.p2align 4
+.Lloop:
+	decl	%ecx
+#define PUT(x) movq %rax,x*8(%rdi) 
+	movq %rax,(%rdi)
+	PUT(1)
+	PUT(2)
+	PUT(3)
+	PUT(4)
+	PUT(5)
+	PUT(6)
+	PUT(7)
+	leaq	64(%rdi),%rdi
+	jnz	.Lloop
+	nop
+	ret
+clear_page_end:	
+	
+	/* C stepping K8 run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+	
+#include <asm/cpufeature.h>
+	    	
+	.section .altinstructions,"a"
+	.align 8
+	.quad  clear_page
+	.quad  clear_page_c
+	.byte  X86_FEATURE_K8_C
+	.byte  clear_page_end-clear_page	
+	.byte  clear_page_c_end-clear_page_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+clear_page_c:
+	movl $4096/8,%ecx
+	xorl %eax,%eax
+	rep 
+	stosq
+	ret
+clear_page_c_end:
+	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
new file mode 100644
index 00000000000..dd3aa47b6bf
--- /dev/null
+++ b/arch/x86_64/lib/copy_page.S
@@ -0,0 +1,101 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+	
+/* Don't use streaming store because it's better when the target
+   ends up in cache. */
+	    
+/* Could vary the prefetch distance based on SMP/UP */
+
+	.globl copy_page
+	.p2align 4
+copy_page:
+	subq	$3*8,%rsp
+	movq	%rbx,(%rsp)
+	movq	%r12,1*8(%rsp)
+	movq	%r13,2*8(%rsp)
+			
+	movl	$(4096/64)-5,%ecx
+	.p2align 4
+.Loop64:	
+  	dec     %rcx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	prefetcht0 5*64(%rsi)
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+
+	leaq    64 (%rsi), %rsi
+	leaq    64 (%rdi), %rdi
+
+	jnz     .Loop64
+
+	movl	$5,%ecx
+	.p2align 4
+.Loop2:	
+	decl   %ecx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+	
+	leaq	64(%rdi),%rdi			
+	leaq	64(%rsi),%rsi			
+	
+	jnz	.Loop2		
+	
+	movq	(%rsp),%rbx
+	movq	1*8(%rsp),%r12
+	movq	2*8(%rsp),%r13
+	addq	$3*8,%rsp
+	ret
+	
+	/* C stepping K8 run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>		
+		
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_page
+	.quad  copy_page_c
+	.byte  X86_FEATURE_K8_C
+	.byte  copy_page_c_end-copy_page_c
+	.byte  copy_page_c_end-copy_page_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+copy_page_c:
+	movl $4096/8,%ecx
+	rep 
+	movsq 
+	ret
+copy_page_c_end:
+	.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
new file mode 100644
index 00000000000..bd556c80424
--- /dev/null
+++ b/arch/x86_64/lib/copy_user.S
@@ -0,0 +1,294 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ * 
+ * Functions to copy from and to user space.		
+ */		 
+
+#define FIX_ALIGNMENT 1
+		
+	#include <asm/current.h>
+	#include <asm/offset.h>
+	#include <asm/thread_info.h>
+	#include <asm/cpufeature.h>
+
+/*
+ * Standard copy_to_user with segment limit checking.
+ *
+ * Adds the count in rdx to the destination in rdi and branches to
+ * bad_to_user when the addition carries or the sum is not below
+ * threadinfo_addr_limit of the thread info fetched by GET_THREAD_INFO.
+ * Otherwise it jumps into copy_user_generic at .Lcug. That 5 byte jump is
+ * registered in .altinstructions with a jump to copy_user_generic_c as its
+ * X86_FEATURE_K8_C replacement.
+ */
+	.globl copy_to_user
+	.p2align 4	
+copy_to_user:
+	GET_THREAD_INFO(%rax)
+	movq %rdi,%rcx
+	addq %rdx,%rcx
+	jc  bad_to_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae bad_to_user
+2:	
+	.byte 0xe9	/* 32bit jump */
+	.long .Lcug-1f
+1:
+
+	.section .altinstr_replacement,"ax"
+3:	.byte 0xe9			/* replacement jmp with 32 bit immediate */
+	.long copy_user_generic_c-1b	/* offset */
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad  2b
+	.quad  3b
+	.byte  X86_FEATURE_K8_C
+	.byte  5
+	.byte  5
+	.previous
+
+/* Standard copy_from_user with segment limit checking */	
+	.globl copy_from_user
+	.p2align 4	
+copy_from_user:
+	GET_THREAD_INFO(%rax)
+	movq %rsi,%rcx
+	addq %rdx,%rcx
+	jc  bad_from_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae  bad_from_user
+	/* FALL THROUGH to copy_user_generic */
+	
+	.section .fixup,"ax"
+	/* must zero dest */
+bad_from_user:
+	movl %edx,%ecx
+	xorl %eax,%eax
+	rep
+	stosb
+bad_to_user:
+	movl	%edx,%eax
+	ret
+	.previous
+	
+		
+/*
+ * copy_user_generic - memory copy with exception handling.
+ * 	
+ * Input:	
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:		
+ * eax uncopied bytes or 0 if successful.
+ */
+	.globl copy_user_generic	
+	.p2align 4
+copy_user_generic:	
+	.byte 0x66,0x66,0x90	/* 5 byte nop for replacement jump */	
+	.byte 0x66,0x90
+1:		
+	.section .altinstr_replacement,"ax"
+2:	.byte 0xe9	             /* near jump with 32bit immediate */
+	.long copy_user_generic_c-1b /* offset */
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_user_generic
+	.quad  2b
+	.byte  X86_FEATURE_K8_C
+	.byte  5
+	.byte  5
+	.previous
+.Lcug:	
+	pushq %rbx
+	xorl %eax,%eax		/*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+
+	movl $64,%ebx	
+	shrq $6,%rdx
+	decq %rdx
+	js   .Lhandle_tail
+	
+	.p2align 4
+.Lloop:
+.Ls1:	movq (%rsi),%r11
+.Ls2:	movq 1*8(%rsi),%r8
+.Ls3:	movq 2*8(%rsi),%r9
+.Ls4:	movq 3*8(%rsi),%r10
+.Ld1:	movq %r11,(%rdi)
+.Ld2:	movq %r8,1*8(%rdi)
+.Ld3:	movq %r9,2*8(%rdi)
+.Ld4:	movq %r10,3*8(%rdi)
+		
+.Ls5:	movq 4*8(%rsi),%r11
+.Ls6:	movq 5*8(%rsi),%r8
+.Ls7:	movq 6*8(%rsi),%r9
+.Ls8:	movq 7*8(%rsi),%r10
+.Ld5:	movq %r11,4*8(%rdi)
+.Ld6:	movq %r8,5*8(%rdi)
+.Ld7:	movq %r9,6*8(%rdi)
+.Ld8:	movq %r10,7*8(%rdi)
+	
+	decq %rdx
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	
+	jns  .Lloop
+
+	.p2align 4
+.Lhandle_tail:
+	movl %ecx,%edx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	movl $8,%ebx
+	.p2align 4
+.Lloop_8:
+.Ls9:	movq (%rsi),%r8
+.Ld9:	movq %r8,(%rdi)
+	decl %ecx
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz .Lloop_8
+	
+.Lhandle_7:		
+	movl %edx,%ecx	
+	andl $7,%ecx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+.Ls10:	movb (%rsi),%bl
+.Ld10:	movb %bl,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+			
+.Lende:
+	popq %rbx
+	ret	
+
+#ifdef FIX_ALIGNMENT		  		
+	/* align destination */
+	.p2align 4
+.Lbad_alignment:
+	movl $8,%r9d
+	subl %ecx,%r9d
+	movl %r9d,%ecx
+	cmpq %r9,%rdx
+	jz   .Lhandle_7
+	js   .Lhandle_7
+.Lalign_1:		
+.Ls11:	movb (%rsi),%bl
+.Ld11:	movb %bl,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz .Lalign_1
+	subq %r9,%rdx
+	jmp .Lafter_bad_alignment
+#endif
+	
+	/* table sorted by exception address */	
+	.section __ex_table,"a"
+	.align 8
+	.quad .Ls1,.Ls1e
+	.quad .Ls2,.Ls2e
+	.quad .Ls3,.Ls3e
+	.quad .Ls4,.Ls4e	
+	.quad .Ld1,.Ls1e
+	.quad .Ld2,.Ls2e
+	.quad .Ld3,.Ls3e
+	.quad .Ld4,.Ls4e
+	.quad .Ls5,.Ls5e
+	.quad .Ls6,.Ls6e
+	.quad .Ls7,.Ls7e
+	.quad .Ls8,.Ls8e	
+	.quad .Ld5,.Ls5e
+	.quad .Ld6,.Ls6e
+	.quad .Ld7,.Ls7e
+	.quad .Ld8,.Ls8e
+	.quad .Ls9,.Le_quad
+	.quad .Ld9,.Le_quad
+	.quad .Ls10,.Le_byte
+	.quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT	
+	.quad .Ls11,.Lzero_rest
+	.quad .Ld11,.Lzero_rest
+#endif
+	.quad .Le5,.Le_zero
+	.previous
+
+	/* compute 64-offset for main loop. 8 bytes accuracy with error on the 
+	   pessimistic side. this is gross. it would be better to fix the 
+	   interface. */	
+	/* eax: zero, ebx: 64 */
+.Ls1e: 	addl $8,%eax
+.Ls2e: 	addl $8,%eax
+.Ls3e: 	addl $8,%eax
+.Ls4e: 	addl $8,%eax
+.Ls5e: 	addl $8,%eax
+.Ls6e: 	addl $8,%eax
+.Ls7e: 	addl $8,%eax
+.Ls8e: 	addl $8,%eax
+	addq %rbx,%rdi	/* +64 */
+	subq %rax,%rdi  /* correct destination with computed offset */
+
+	shlq $6,%rdx	/* loop counter * 64 (stride length) */
+	addq %rax,%rdx	/* add offset to loopcnt */
+	andl $63,%ecx	/* remaining bytes */
+	addq %rcx,%rdx	/* add them */
+	jmp .Lzero_rest
+
+	/* exception on quad word loop in tail handling */
+	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+	shll $3,%ecx
+	andl $7,%edx
+	addl %ecx,%edx
+	/* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+	movq %rdx,%rcx
+.Le_byte:
+	xorl %eax,%eax
+.Le5:	rep 
+	stosb
+	/* when there is another exception while zeroing the rest just return */
+.Le_zero:		
+	movq %rdx,%rax
+	jmp .Lende
+
+	/* C stepping K8 run faster using the string copy instructions.
+	   This is also a lot simpler. Use them when possible.
+	   Patch in jmps to this code instead of copying it fully
+	   to avoid unwanted aliasing in the exception tables. */
+		
+ /* rdi	destination
+  * rsi source
+  * rdx count
+  *
+  * Output:		
+  * eax uncopied bytes or 0 if successful.
+  *
+  * The count is split into count/8 quadwords (rep movsq) and count%8
+  * trailing bytes (rep movsb). A fault in the quadword copy resumes at 3:
+  * and returns rdx + rcx*8; a fault in the byte copy resumes at 4: and
+  * returns the remaining ecx.
+  */			
+copy_user_generic_c:
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx	
+1:	rep 
+	movsq 
+	movl %edx,%ecx
+2:	rep
+	movsb
+4:	movl %ecx,%eax
+	ret
+3:	lea (%rdx,%rcx,8),%rax
+	ret
+	
+	.section __ex_table,"a"
+	.quad 1b,3b
+	.quad 2b,4b
+	.previous
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
new file mode 100644
index 00000000000..01808ec3783
--- /dev/null
+++ b/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *	
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+ 	#include <linux/linkage.h>
+	#include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
+ * destination is zeroed.
+ * 
+ * Input
+ * rdi  source
+ * rsi  destination
+ * edx  len (32bit)
+ * ecx  sum (32bit) 
+ * r8   src_err_ptr (int)
+ * r9   dst_err_ptr (int)
+ *
+ * Output
+ * eax  32bit sum (folded from the 64bit accumulator). undefined in case of exception.
+ * 
+ * Wrappers need to take care of valid exception sum and zeroing.		 
+ * They also should align source or destination to 8 bytes.
+ */
+
+	.macro source
+10:
+	.section __ex_table,"a"
+	.align 8
+	.quad 10b,.Lbad_source
+	.previous
+	.endm
+		
+	.macro dest
+20:
+	.section __ex_table,"a"
+	.align 8
+	.quad 20b,.Lbad_dest
+	.previous
+	.endm
+			
+	.macro ignore L=.Lignore
+30:
+	.section __ex_table,"a"
+	.align 8
+	.quad 30b,\L
+	.previous
+	.endm
+	
+				
+	.globl csum_partial_copy_generic
+	.p2align 4
+csum_partial_copy_generic:
+	cmpl	 $3*64,%edx
+	jle	 .Lignore
+
+.Lignore:		
+	subq  $7*8,%rsp
+	movq  %rbx,2*8(%rsp)
+	movq  %r12,3*8(%rsp)
+	movq  %r14,4*8(%rsp)
+	movq  %r13,5*8(%rsp)
+	movq  %rbp,6*8(%rsp)
+
+	movq  %r8,(%rsp)
+	movq  %r9,1*8(%rsp)
+	
+	movl  %ecx,%eax
+	movl  %edx,%ecx
+
+	xorl  %r9d,%r9d
+	movq  %rcx,%r12
+
+	shrq  $6,%r12
+	jz    .Lhandle_tail       /* < 64 */
+
+	clc
+	
+	/* main loop. copy and checksum in 64 byte blocks */
+	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+	/* r11:	temp3, rdx: temp4, r12 loopcnt */
+	/* r10:	temp5, rbp: temp6, r14 temp7, r13 temp8 */
+	.p2align 4
+.Lloop:
+	source
+	movq  (%rdi),%rbx
+	source
+	movq  8(%rdi),%r8
+	source
+	movq  16(%rdi),%r11
+	source
+	movq  24(%rdi),%rdx
+
+	source
+	movq  32(%rdi),%r10
+	source
+	movq  40(%rdi),%rbp
+	source
+	movq  48(%rdi),%r14
+	source
+	movq  56(%rdi),%r13
+		
+	ignore 2f
+	prefetcht0 5*64(%rdi)
+2:							
+	adcq  %rbx,%rax
+	adcq  %r8,%rax
+	adcq  %r11,%rax
+	adcq  %rdx,%rax
+	adcq  %r10,%rax
+	adcq  %rbp,%rax
+	adcq  %r14,%rax
+	adcq  %r13,%rax
+
+	decl %r12d
+	
+	dest
+	movq %rbx,(%rsi)
+	dest
+	movq %r8,8(%rsi)
+	dest
+	movq %r11,16(%rsi)
+	dest
+	movq %rdx,24(%rsi)
+
+	dest
+	movq %r10,32(%rsi)
+	dest
+	movq %rbp,40(%rsi)
+	dest
+	movq %r14,48(%rsi)
+	dest
+	movq %r13,56(%rsi)
+	
+3:
+	
+	leaq 64(%rdi),%rdi
+	leaq 64(%rsi),%rsi
+
+	jnz   .Lloop
+
+	adcq  %r9,%rax
+
+	/* do last up to 56 bytes */
+.Lhandle_tail:
+	/* ecx:	count */
+	movl %ecx,%r10d
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz 	 .Lfold
+	clc
+	.p2align 4
+.Lloop_8:	
+	source
+	movq (%rdi),%rbx
+	adcq %rbx,%rax
+	decl %ecx
+	dest
+	movq %rbx,(%rsi)
+	leaq 8(%rsi),%rsi /* preserve carry */
+	leaq 8(%rdi),%rdi
+	jnz	.Lloop_8
+	adcq %r9,%rax	/* add in carry */
+
+.Lfold:
+	/* reduce checksum to 32bits */
+	movl %eax,%ebx
+	shrq $32,%rax
+	addl %ebx,%eax
+	adcl %r9d,%eax
+
+	/* do last up to 6 bytes */	
+.Lhandle_7:
+	movl %r10d,%ecx
+	andl $7,%ecx
+	shrl $1,%ecx
+	jz   .Lhandle_1
+	movl $2,%edx
+	xorl %ebx,%ebx
+	clc  
+	.p2align 4
+.Lloop_1:	
+	source
+	movw (%rdi),%bx
+	adcl %ebx,%eax
+	dest
+	decl %ecx
+	movw %bx,(%rsi)
+	leaq 2(%rdi),%rdi
+	leaq 2(%rsi),%rsi
+	jnz .Lloop_1
+	adcl %r9d,%eax	/* add in carry */
+	
+	/* handle last odd byte */
+.Lhandle_1:
+	testl $1,%r10d
+	jz    .Lende
+	xorl  %ebx,%ebx
+	source
+	movb (%rdi),%bl
+	dest
+	movb %bl,(%rsi)
+	addl %ebx,%eax
+	adcl %r9d,%eax		/* carry */
+			
+.Lende:
+	movq 2*8(%rsp),%rbx
+	movq 3*8(%rsp),%r12
+	movq 4*8(%rsp),%r14
+	movq 5*8(%rsp),%r13
+	movq 6*8(%rsp),%rbp
+	addq $7*8,%rsp
+	ret
+
+	/* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+	movq (%rsp),%rax
+	testq %rax,%rax
+	jz   .Lende
+	movl $-EFAULT,(%rax)
+	jmp  .Lende
+	
+.Lbad_dest:
+	movq 8(%rsp),%rax
+	testq %rax,%rax
+	jz   .Lende	
+	movl $-EFAULT,(%rax)
+	jmp .Lende
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
new file mode 100644
index 00000000000..5384e227cdf
--- /dev/null
+++ b/arch/x86_64/lib/csum-partial.c
@@ -0,0 +1,150 @@
+/*
+ * arch/x86_64/lib/csum-partial.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+ 
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+#define __force_inline inline __attribute__((always_inline))
+
+static inline unsigned short from32to16(unsigned a) 
+{
+	unsigned short b = a >> 16; 
+	asm("addw %w2,%w0\n\t"
+	    "adcw $0,%w0\n" 
+	    : "=r" (b)
+	    : "0" (b), "r" (a));
+	return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ * 
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to a 128-byte inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+	unsigned odd, count;
+	unsigned long result = 0;
+
+	if (unlikely(len == 0))
+		return result; 
+	odd = 1 & (unsigned long) buff;
+	if (unlikely(odd)) {
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *)buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			unsigned long zero;
+			unsigned count64;
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;	/* nr of 64-bit words.. */
+
+			/* main loop using 64byte blocks */
+			zero = 0;
+			count64 = count >> 3;
+			while (count64) { 
+				asm("addq 0*8(%[src]),%[res]\n\t"
+				    "adcq 1*8(%[src]),%[res]\n\t"
+				    "adcq 2*8(%[src]),%[res]\n\t"
+				    "adcq 3*8(%[src]),%[res]\n\t"
+				    "adcq 4*8(%[src]),%[res]\n\t"
+				    "adcq 5*8(%[src]),%[res]\n\t"
+				    "adcq 6*8(%[src]),%[res]\n\t"
+				    "adcq 7*8(%[src]),%[res]\n\t"
+				    "adcq %[zero],%[res]"
+				    : [res] "=r" (result)
+				    : [src] "r" (buff), [zero] "r" (zero),
+				    "[res]" (result));
+				buff += 64;
+				count64--;
+			}
+
+			/* last up to 7 8byte blocks */
+			count %= 8; 
+			while (count) { 
+				asm("addq %1,%0\n\t"
+				    "adcq %2,%0\n" 
+					    : "=r" (result)
+				    : "m" (*(unsigned long *)buff), 
+				    "r" (zero),  "0" (result));
+				--count; 
+					buff += 8;
+			}
+			result = add32_with_carry(result>>32,
+						  result&0xffffffff); 
+
+			if (len & 4) {
+				result += *(unsigned int *) buff;
+				buff += 4;
+			}
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+	result = add32_with_carry(result>>32, result & 0xffffffff); 
+	if (unlikely(odd)) { 
+		result = from32to16(result);
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	}
+	return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum)
+{
+	return add32_with_carry(do_csum(buff, len), sum); 
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+unsigned short ip_compute_csum(unsigned char * buff, int len)
+{
+	return csum_fold(csum_partial(buff,len,0));
+}
+
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
new file mode 100644
index 00000000000..94323f20816
--- /dev/null
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -0,0 +1,129 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ * 
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+
+#include <asm/checksum.h>
+#include <linux/module.h>
+
+/** 
+ * csum_partial_copy_from_user - Copy and checksum from user space. 
+ * @src: source address (user space) 
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for a bad source address.
+ * 
+ * Returns a 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits. 
+ *
+ * *errp is cleared on entry. When access_ok() rejects the source, or
+ * csum_partial_copy_generic() reports an error, *errp is set to -EFAULT and
+ * the len bytes at dst that are still outstanding at that point are zeroed.
+ * A fault in the 16bit alignment loop returns immediately with *errp holding
+ * the __get_user() result and without zeroing dst.
+ */ 
+unsigned int 
+csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst,
+			    int len, unsigned int isum, int *errp)
+{ 
+	might_sleep();
+	*errp = 0;
+	if (likely(access_ok(VERIFY_READ,src, len))) { 
+		/* Why 6, not 7? To handle odd addresses aligned we
+		   would need to do considerable complications to fix the
+		   checksum which is defined as an 16bit accumulator. The
+		   fix alignment code is primarily for performance
+		   compatibility with 32bit and that will handle odd
+		   addresses slowly too. */
+		if (unlikely((unsigned long)src & 6)) {			
+			while (((unsigned long)src & 6) && len >= 2) { 
+				__u16 val16;			
+				*errp = __get_user(val16, (__u16 __user *)src); 
+				if (*errp)
+					return isum;
+				*(__u16 *)dst = val16;
+				isum = add32_with_carry(isum, val16); 
+				src += 2; 
+				dst += 2; 
+				len -= 2;
+			}
+		}
+		isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL);
+		if (likely(*errp == 0)) 
+			return isum;
+	} 
+	*errp = -EFAULT;
+	memset(dst,0,len); 
+	return isum;		
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/** 
+ * csum_partial_copy_to_user - Copy and checksum to user space. 
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad destination address.
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */ 
+unsigned int 
+csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst,
+			  int len, unsigned int isum, int *errp)
+{ 
+	might_sleep();
+	if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+		*errp = -EFAULT;
+		return 0; 
+	}
+
+	if (unlikely((unsigned long)dst & 6)) {
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/lib